diff --git a/.actions/README.md b/.actions/README.md new file mode 100644 index 0000000..7eb3226 --- /dev/null +++ b/.actions/README.md @@ -0,0 +1,11 @@ +scripts for generating notebooks + +**GHA here** + +- generate notebooks +- flow to ban any added notebook in PR (fail if changes in .notebooks) + +**PL side** + +- git submodule with these examples +- gha cron to update submodule head diff --git a/.actions/assistant.py b/.actions/assistant.py new file mode 100644 index 0000000..f489308 --- /dev/null +++ b/.actions/assistant.py @@ -0,0 +1,713 @@ +import base64 +import json +import os +import re +from datetime import datetime +from shutil import copyfile +from textwrap import wrap +from typing import Any, Dict, List, Optional, Sequence, Tuple +from warnings import warn + +import fire +import requests +import tqdm +import yaml +from pip._internal.operations import freeze +from wcmatch import glob + +_PATH_HERE = os.path.dirname(__file__) +_PATH_ROOT = os.path.dirname(_PATH_HERE) +PATH_REQ_DEFAULT = os.path.join(_PATH_ROOT, "_requirements", "default.txt") +PATH_SCRIPT_RENDER = os.path.join(_PATH_HERE, "_ipynb-render.sh") +PATH_SCRIPT_TEST = os.path.join(_PATH_HERE, "_ipynb-test.sh") +# https://askubuntu.com/questions/909918/how-to-show-unzip-progress +UNZIP_PROGRESS_BAR = ' | awk \'BEGIN {ORS=" "} {if(NR%10==0)print "."}\'' +REPO_NAME = "lightning-tutorials" +COLAB_REPO_LINK = "https://colab.research.google.com/github/PytorchLightning" +BRANCH_DEFAULT = "main" +BRANCH_PUBLISHED = "publication" +DIR_NOTEBOOKS = ".notebooks" +URL_PL_DOWNLOAD = f"https://github.com/Lightning-AI/{REPO_NAME}/raw/{BRANCH_DEFAULT}" +TEMPLATE_HEADER = f"""# %%%% [markdown] +# +# # %(title)s +# +# * **Author:** %(author)s +# * **License:** %(license)s +# * **Generated:** %(generated)s +# +# %(description)s +# +# --- +# Open in [![Open In Colab](https://colab.research.google.com/assets/colab-badge.png){{height="20px" width="117px"}}]({COLAB_REPO_LINK}/{REPO_NAME}/blob/{BRANCH_PUBLISHED}/{DIR_NOTEBOOKS}/%(local_ipynb)s) +# +# Give us a ⭐ [on Github](https://www.github.com/Lightning-AI/lightning/) +# | Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/stable/) +# | Join us [on Slack](https://www.pytorchlightning.ai/community) + +""" +TEMPLATE_SETUP = """# %%%% [markdown] +# ## Setup +# This notebook requires some packages besides pytorch-lightning. + +# %%%% colab={} colab_type="code" id="LfrJLKPFyhsK" +# ! pip install --quiet %(requirements)s + +""" +TEMPLATE_FOOTER = """ +# %% [markdown] +# ## Congratulations - Time to Join the Community! +# +# Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning +# movement, you can do so in the following ways! +# +# ### Star [Lightning](https://github.com/Lightning-AI/lightning) on GitHub +# The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool +# tools we're building. +# +# ### Join our [Slack](https://www.pytorchlightning.ai/community)! +# The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself +# and share your interests in `#general` channel +# +# +# ### Contributions ! +# The best way to contribute to our community is to become a code contributor! At any time you can go to +# [Lightning](https://github.com/Lightning-AI/lightning) or [Bolt](https://github.com/Lightning-AI/lightning-bolts) +# GitHub Issues page and filter for "good first issue". 
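The header and setup blocks are ordinary `%`-format templates that `convert_ipynb` later fills from each notebook's `.meta.yml`; the quadrupled percent signs survive formatting as the `# %%` markers jupytext uses to split cells. A minimal sketch of that rendering step, using a trimmed-down template and made-up metadata:

```python
# Minimal sketch (illustrative only) of how the percent templates above are rendered:
# "%"-formatting fills the %(key)s placeholders from the parsed .meta.yml and collapses
# the escaped "%%%%" marker into the "# %%" cell delimiter that jupytext expects.
template = "# %%%% [markdown]\n#\n# # %(title)s\n#\n# * **Author:** %(author)s\n"
meta = {"title": "Hypothetical tutorial", "author": "Jane Doe"}  # stand-in metadata

rendered = template % meta
assert rendered.startswith("# %% [markdown]")  # "%%%%" -> "%%"
print(rendered)
```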
+# +# * [Lightning good first issue](https://github.com/Lightning-AI/lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) +# * [Bolt good first issue](https://github.com/Lightning-AI/lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) +# * You can also contribute your own notebooks with useful examples ! +# +# ### Great thanks from the entire Pytorch Lightning Team for your interest ! +# +# [![Pytorch Lightning](https://raw.githubusercontent.com/Lightning-AI/lightning/master/docs/source/_static/images/logo.png){height="60px" width="240px"}](https://pytorchlightning.ai) + +""" +TEMPLATE_CARD_ITEM = """ +.. customcarditem:: + :header: %(title)s + :card_description: %(short_description)s + :tags: %(tags)s +""" + + +def load_requirements(path_req: str = PATH_REQ_DEFAULT) -> list: + """Load the requirements from a file.""" + with open(path_req) as fp: + req = fp.readlines() + req = [r[: r.index("#")] if "#" in r else r for r in req] + req = [r.strip() for r in req] + req = [r for r in req if r] + return req + + +def get_running_cuda_version() -> str: + """Extract the version of actual CUDA for this runtime.""" + try: + import torch + + return torch.version.cuda or "" + except ImportError: + return "" + + +def get_running_torch_version(): + """Extract the version of actual PyTorch for this runtime.""" + try: + import torch + + ver = torch.__version__ + return ver[: ver.index("+")] if "+" in ver else ver + except ImportError: + return "" + + +_TORCH_VERSION = get_running_torch_version() +_CUDA_VERSION = get_running_cuda_version() +_RUNTIME_VERSIONS = dict( + TORCH_VERSION_FULL=_TORCH_VERSION, + TORCH_VERSION=_TORCH_VERSION[: _TORCH_VERSION.index("+")] if "+" in _TORCH_VERSION else _TORCH_VERSION, + TORCH_MAJOR_DOT_MINOR=".".join(_TORCH_VERSION.split(".")[:2]), + CUDA_VERSION=_CUDA_VERSION, + CUDA_MAJOR_MINOR=_CUDA_VERSION.replace(".", ""), + DEVICE=f"cu{_CUDA_VERSION.replace('.', '')}" if _CUDA_VERSION else "cpu", +) + + +class AssistantCLI: + """Collection of handy CLI commands.""" + + _LOCAL_ACCELERATOR = "cpu,gpu" if get_running_cuda_version() else "cpu" + DEVICE_ACCELERATOR = os.environ.get("ACCELERATOR", _LOCAL_ACCELERATOR).lower() + DATASETS_FOLDER = os.environ.get("PATH_DATASETS", "_datasets") + DRY_RUN = bool(int(os.environ.get("DRY_RUN", 0))) + _META_REQUIRED_FIELDS = ("title", "author", "license", "description") + _SKIP_DIRS = ( + ".actions", + ".azure", + ".datasets", + ".github", + "_docs", + "_TEMP", + "_requirements", + DIR_NOTEBOOKS, + ) + _META_FILE_REGEX = ".meta.{yaml,yml}" + _META_PIP_KEY = "pip__" + _META_ACCEL_DEFAULT = _LOCAL_ACCELERATOR.split(",") + + # Map directory names to tag names. Note that dashes will be replaced with spaces in rendered tags in the docs. + _DIR_TO_TAG = { + "course_UvA-DL": "UvA-DL-Course", + "lightning_examples": "Lightning-Examples", + "flash_tutorials": "Kaggle", + } + _BASH_SCRIPT_BASE = ("#!/bin/bash", "set -e", "") + _EXT_ARCHIVE_ZIP = (".zip",) + _EXT_ARCHIVE_TAR = (".tar", ".gz") + _EXT_ARCHIVE = _EXT_ARCHIVE_ZIP + _EXT_ARCHIVE_TAR + _AZURE_POOL = "lit-rtx-3090" + _AZURE_DOCKER = "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + + @staticmethod + def _find_meta(folder: str) -> str: + """Search for a meta file in given folder and return its path. 
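The substitution table built from these probes is what later selects the PyTorch wheel index (`.../whl/cu116` vs `.../whl/cpu`) for the generated `pip install` commands. A worked sketch with assumed version strings:

```python
# Worked example (assumed versions) of the runtime lookup table assembled above.
torch_version, cuda_version = "1.12.1+cu116", "11.6"  # e.g. torch.__version__, torch.version.cuda

versions = dict(
    TORCH_VERSION_FULL=torch_version,
    TORCH_VERSION=torch_version.split("+")[0],                     # "1.12.1"
    TORCH_MAJOR_DOT_MINOR=".".join(torch_version.split(".")[:2]),  # "1.12"
    CUDA_VERSION=cuda_version,
    CUDA_MAJOR_MINOR=cuda_version.replace(".", ""),                # "116"
    DEVICE=f"cu{cuda_version.replace('.', '')}" if cuda_version else "cpu",
)
print(versions["DEVICE"])  # -> "cu116"; an empty CUDA version yields "cpu"
```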
+ + Args: + folder: path to the folder with python script, meta and artefacts + """ + files = glob.glob(os.path.join(folder, AssistantCLI._META_FILE_REGEX), flags=glob.BRACE) + if len(files) == 1: + return files[0] + return "" + + @staticmethod + def _load_meta(folder: str, strict: bool = False) -> Optional[dict]: + """Loading meta-data for a particular notebook with given folder path. + + Args: + folder: path to the folder with python script, meta and artefacts + strict: raise error if meta is missing required feilds + """ + fpath = AssistantCLI._find_meta(folder) + assert fpath, f"Missing meta file in folder: {folder}" + meta = yaml.safe_load(open(fpath)) + + if strict: + meta_miss = [fl for fl in AssistantCLI._META_REQUIRED_FIELDS if fl not in meta] + if meta_miss: + raise ValueError(f"Meta file '{fpath}' is missing the following fields: {meta_miss}") + return meta + + @staticmethod + def _valid_conf_folder(folder: str) -> Tuple[str, str]: + """Validate notebook folder if it has required meta file and optional thumb. + + Args: + folder: path to the folder with python script, meta and artefacts + """ + meta_files = [os.path.join(folder, f".meta.{ext}") for ext in ("yml", "yaml")] + meta_files = [pf for pf in meta_files if os.path.isfile(pf)] + if len(meta_files) != 1: + raise FileExistsError(f"found {len(meta_files)} meta (yaml|yml) files in folder: {folder}") + thumb_files = glob.glob(os.path.join(folder, ".thumb.*")) + thumb_names = list(map(os.path.basename, thumb_files)) + if len(thumb_files) > 1: + raise FileExistsError(f"Too many thumb files ({thumb_names}) found in folder: {folder}") + thumb = thumb_files[0] if thumb_files else "" + return meta_files[0], thumb + + @staticmethod + def _valid_folder(folder: str, ext: str) -> Tuple[str, str, str]: + """Validate notebook folder if it has required meta file, python script or ipython notebook (depending on + the stage) and optional thumb. + + Args: + folder: path to the folder with python script, meta and artefacts + ext: extension determining the stage - ".py" for python script nad ".ipynb" for notebook + """ + files = glob.glob(os.path.join(folder, f"*{ext}")) + if len(files) != 1: + names = list(map(os.path.basename, files)) + raise FileNotFoundError(f"Missing required '{ext}' file in folder: {folder} among {names}") + meta_file, thumb_file = AssistantCLI._valid_conf_folder(folder) + return files[0], meta_file, thumb_file + + @staticmethod + def _valid_accelerator(folder: str) -> bool: + """Parse standard requirements from meta file. + + Args: + folder: path to the folder with python script, meta and artefacts + """ + meta = AssistantCLI._load_meta(folder) + meta_accels = [acc.lower() for acc in meta.get("accelerator", AssistantCLI._META_ACCEL_DEFAULT)] + device_accels = AssistantCLI.DEVICE_ACCELERATOR.lower().split(",") + return any(ac in meta_accels for ac in device_accels) + + @staticmethod + def _parse_requirements(folder: str) -> Tuple[str, str]: + """Parse standard requirements from meta file. 
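All of these validators revolve around the per-folder `.meta.yml`. Collecting the keys the CLI actually reads, a hypothetical meta file (all values made up) would deserialize to roughly:

```python
# Hypothetical .meta.yml content as returned by yaml.safe_load (all values made up);
# the key names mirror what _load_meta, _valid_accelerator, _parse_requirements and
# _bash_download_data look for.
meta = {
    "title": "Image classification with a tiny CNN",    # required
    "author": "Jane Doe",                                # required
    "license": "CC BY-SA",                               # required
    "description": "Longer, possibly multi-line text.",  # required
    "short_description": "One-liner used for the docs card.",
    "tags": ["Image"],
    "requirements": ["torchvision"],                      # installed before render/test
    "accelerator": ["CPU", "GPU"],                        # runners allowed to execute it
    "datasets": {"web": ["https://example.com/data.zip"]},
    "pip__find-links": ["https://example.com/%(TORCH_MAJOR_DOT_MINOR)s/wheels.html"],
}

required = ("title", "author", "license", "description")
assert not [field for field in required if field not in meta]  # the strict check in _load_meta
```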
+ + Args: + folder: path to the folder with python script, meta and artefacts + """ + meta = AssistantCLI._load_meta(folder) + reqs = meta.get("requirements", []) + + meta_pip_args = { + k.replace(AssistantCLI._META_PIP_KEY, ""): v + for k, v in meta.items() + if k.startswith(AssistantCLI._META_PIP_KEY) + } + pip_args = ["--extra-index-url https://download.pytorch.org/whl/" + _RUNTIME_VERSIONS.get("DEVICE")] + for pip_key in meta_pip_args: + if not isinstance(meta_pip_args[pip_key], (list, tuple, set)): + meta_pip_args[pip_key] = [meta_pip_args[pip_key]] + for arg in meta_pip_args[pip_key]: + arg = arg % _RUNTIME_VERSIONS + pip_args.append(f"--{pip_key} {arg}") + + return " ".join([f'"{req}"' for req in reqs]), " ".join(pip_args) + + @staticmethod + def _bash_download_data(folder: str) -> List[str]: + """Generate sequence of commands for optional downloading dataset specified in the meta file. + + Args: + folder: path to the folder with python script, meta and artefacts + """ + meta = AssistantCLI._load_meta(folder) + datasets = meta.get("datasets", {}) + data_kaggle = datasets.get("kaggle", []) + cmd = [f"python -m kaggle competitions download -c {name}" for name in data_kaggle] + files = [f"{name}.zip" for name in data_kaggle] + data_web = datasets.get("web", []) + cmd += [f"wget {web} --progress=bar:force:noscroll --tries=3" for web in data_web] + files += [os.path.basename(web) for web in data_web] + for fn in files: + name, ext = os.path.splitext(fn) + if ext not in AssistantCLI._EXT_ARCHIVE: + continue + if ext in AssistantCLI._EXT_ARCHIVE_ZIP: + cmd += [f"unzip -o {fn} -d {AssistantCLI.DATASETS_FOLDER}/{name} {UNZIP_PROGRESS_BAR}"] + else: + cmd += [f"tar -zxvf {fn} --overwrite"] + cmd += [f"rm {fn}"] + cmd += [f"tree -L 2 {AssistantCLI.DATASETS_FOLDER}"] + return cmd + + @staticmethod + def bash_render(folder: str, output_file: str = PATH_SCRIPT_RENDER) -> Optional[str]: + """Prepare bash script for running rendering of a particular notebook. + + Args: + folder: name/path to a folder with notebook files + output_file: if defined, stream the commands to the file + + Returns: + string with nash script content + """ + cmd = list(AssistantCLI._BASH_SCRIPT_BASE) + [f"# Rendering: {folder}"] + if not AssistantCLI.DRY_RUN: + cmd += AssistantCLI._bash_download_data(folder) + ipynb_file, meta_file, thumb_file = AssistantCLI._valid_folder(folder, ext=".ipynb") + pub_ipynb = os.path.join(DIR_NOTEBOOKS, f"{folder}.ipynb") + pub_meta = pub_ipynb.replace(".ipynb", ".yaml") + pub_dir = os.path.dirname(pub_ipynb) + thumb_ext = os.path.splitext(thumb_file)[-1] if thumb_file else "." + pub_thumb = os.path.join(DIR_NOTEBOOKS, f"{folder}{thumb_ext}") if thumb_file else "" + cmd.append(f"mkdir -p {pub_dir}") + if AssistantCLI.DRY_RUN: + # dry run does not execute the notebooks just takes them as they are + cmd.append(f"cp {ipynb_file} {pub_ipynb}") + # copy and add meta config + cmd += [f"cp {meta_file} {pub_meta}", f"cat {pub_meta}", f"git add {pub_meta}"] + else: + pip_req, pip_args = AssistantCLI._parse_requirements(folder) + cmd += [f"pip install {pip_req} --quiet {pip_args}", "pip list"] + cmd.append(f"# available: {AssistantCLI.DEVICE_ACCELERATOR}\n") + if AssistantCLI._valid_accelerator(folder): + cmd.append(f"python -m papermill {ipynb_file} {pub_ipynb} --kernel python") + else: + warn("Invalid notebook's accelerator for this device. 
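`bash_render` and `bash_test` join the two strings returned here into a single `pip install ...` line of the generated shell script; a condensed sketch of that assembly with made-up values:

```python
# Condensed sketch (made-up values) of the install command assembled from a meta file.
device = "cu116"  # _RUNTIME_VERSIONS["DEVICE"]; "cpu" on a CPU-only runner
reqs = ["torchvision", "lightning-bolts"]
extra = {"find-links": ["https://example.com/%(DEVICE)s/wheels.html"]}  # from a "pip__find-links" key

pip_args = [f"--extra-index-url https://download.pytorch.org/whl/{device}"]
for key, values in extra.items():
    pip_args += [f"--{key} {val % {'DEVICE': device}}" for val in values]

pip_req = " ".join(f'"{r}"' for r in reqs)
print(f"pip install {pip_req} --quiet {' '.join(pip_args)}")
```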
So no outputs will be generated.", RuntimeWarning) + cmd.append(f"cp {ipynb_file} {pub_ipynb}") + # Export the actual packages used in runtime + cmd.append(f"meta_file=$(python .actions/assistant.py update-env-details {folder})") + # copy and add to version the enriched meta config + cmd += ["echo $meta_file", "cat $meta_file", "git add $meta_file"] + # if thumb image is linked to the notebook, copy and version it too + if thumb_file: + cmd += [f"cp {thumb_file} {pub_thumb}", f"git add {pub_thumb}"] + # add the generated notebook to version + cmd.append(f"git add {pub_ipynb}") + if not output_file: + return os.linesep.join(cmd) + with open(output_file, "w") as fp: + fp.write(os.linesep.join(cmd)) + + @staticmethod + def bash_test(folder: str, output_file: str = PATH_SCRIPT_TEST) -> Optional[str]: + """Prepare bash script for running tests of a particular notebook. + + Args: + folder: name/path to a folder with notebook files + output_file: if defined, stream the commands to the file + + Returns: + string with nash script content + """ + cmd = list(AssistantCLI._BASH_SCRIPT_BASE) + [f"# Testing: {folder}"] + cmd += AssistantCLI._bash_download_data(folder) + ipynb_file, meta_file, _ = AssistantCLI._valid_folder(folder, ext=".ipynb") + + # prepare isolated environment with inheriting the global packages + path_venv = os.path.join(folder, "venv") + cmd += [ + f"python -m virtualenv --system-site-packages {path_venv}", + f"source {os.path.join(path_venv, 'bin', 'activate')}", + "pip --version", + ] + + cmd.append(f"# available: {AssistantCLI.DEVICE_ACCELERATOR}") + if AssistantCLI._valid_accelerator(folder): + # and install specific packages + pip_req, pip_args = AssistantCLI._parse_requirements(folder) + cmd += [f"pip install {pip_req} --quiet {pip_args}", "pip list"] + # Export the actual packages used in runtime + cmd.append(f"meta_file=$(python .actions/assistant.py update-env-details {folder} --base_path .)") + # show created meta config + cmd += ["echo $meta_file", "cat $meta_file"] + cmd.append(f"python -m pytest {ipynb_file} -v --nbval --nbval-cell-timeout=300") + else: + pub_ipynb = os.path.join(DIR_NOTEBOOKS, f"{folder}.ipynb") + pub_meta = pub_ipynb.replace(".ipynb", ".yaml") + # copy and add meta config + cmd += [ + f"mkdir -p {os.path.dirname(pub_meta)}", + f"cp {meta_file} {pub_meta}", + f"cat {pub_meta}", + f"git add {pub_meta}", + ] + warn("Invalid notebook's accelerator for this device. So no tests will be run!!!", RuntimeWarning) + # deactivate and clean local environment + cmd += ["deactivate", f"rm -rf {os.path.join(folder, 'venv')}"] + if not output_file: + return os.linesep.join(cmd) + with open(output_file, "w") as fp: + fp.write(os.linesep.join(cmd)) + + @staticmethod + def convert_ipynb(folder: str) -> None: + """Add template header and footer to the python base script. 
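Both script builders are exposed through `fire`, so the CI invokes them as `python .actions/assistant.py bash-render <folder>`; they can equally be driven from Python, as `.actions/test_cli.py` does. A usage sketch, assuming the packages from `.actions/requires.txt` are installed and the target folder already contains the generated `*.ipynb` plus its `.meta.yml`:

```python
# Usage sketch: drive the script builders from Python instead of the fire CLI.
import os
import sys

os.environ.setdefault("DRY_RUN", "1")   # read at import time; skips dataset downloads
sys.path.append(".actions")             # make assistant.py importable from the repo root
from assistant import AssistantCLI

script = AssistantCLI.bash_render("templates/simple", output_file="")  # empty path -> return text
print(script)  # the commands that would otherwise be written to .actions/_ipynb-render.sh
```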
+ + Args: + folder: folder with python script + """ + fpath, _, _ = AssistantCLI._valid_folder(folder, ext=".py") + with open(fpath) as fp: + py_script = fp.readlines() + + meta = AssistantCLI._load_meta(folder, strict=True) + meta.update( + dict(local_ipynb=f"{folder}.ipynb"), + generated=datetime.now().isoformat(), + ) + meta["description"] = meta["description"].replace(os.linesep, f"{os.linesep}# ") + + header = TEMPLATE_HEADER % meta + requires = set(load_requirements() + meta["requirements"]) + setup = TEMPLATE_SETUP % dict(requirements=" ".join([f'"{req}"' for req in requires])) + py_script = [header + setup] + py_script + [TEMPLATE_FOOTER] + + py_script = AssistantCLI._replace_images(py_script, folder) + + with open(fpath, "w") as fp: + fp.writelines(py_script) + + os.system(f'python -m jupytext --set-formats "ipynb,py:percent" {fpath}') + + @staticmethod + def _replace_images(lines: list, local_dir: str) -> list: + """Update images by URL to GitHub raw source. + + Args: + lines: string lines from python script + local_dir: relative path to the folder with script + """ + md = os.linesep.join([ln.rstrip() for ln in lines]) + p_imgs = [] + # todo: add a rule to replace this paths only i md sections + # because * is a greedy quantifier, trying to match as much as it can. Make it *? + p_imgs += re.findall(r"src=\"(.*?)\"", md) + p_imgs += re.findall(r"!\[.*?\]\((.*?)\)", md) + + # update all images + for p_img in set(p_imgs): + if p_img.startswith("http://") or p_img.startswith("https://"): + url_path = p_img + im = requests.get(p_img, stream=True).raw.read() + else: + url_path = "/".join([URL_PL_DOWNLOAD, local_dir, p_img]) + p_local_img = os.path.join(local_dir, p_img) + with open(p_local_img, "rb") as fp: + im = fp.read() + im_base64 = base64.b64encode(im).decode("utf-8") + _, ext = os.path.splitext(p_img) + md = md.replace(f'src="{p_img}"', f'src="{url_path}"') + md = md.replace(f"]({p_img})", f"](data:image/{ext[1:]};base64,{im_base64})") + + return [ln + os.linesep for ln in md.split(os.linesep)] + + @staticmethod + def _is_ipynb_parent_dir(dir_path: str) -> bool: + """Determine in recursive fasion of a folder is valid notebook file or any of sub-folders is.""" + if AssistantCLI._find_meta(dir_path): + return True + sub_dirs = [d for d in glob.glob(os.path.join(dir_path, "*")) if os.path.isdir(d)] + return any(AssistantCLI._is_ipynb_parent_dir(d) for d in sub_dirs) + + @staticmethod + def group_folders( + fpath_gitdiff: str, + fpath_change_folders: str = "changed-folders.txt", + fpath_drop_folders: str = "dropped-folders.txt", + fpath_actual_dirs: Sequence[str] = tuple(), + strict: bool = True, + root_path: str = "", + ) -> None: + """Parsing the raw git diff and group changes by folders. 
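A short, self-contained look at what the two image patterns capture on a made-up markdown line, and what each match is rewritten to:

```python
import re

# The two patterns _replace_images scans the markdown for (sample line is made up).
md = 'Intro ![diagram](figs/arch.png) and <img src="figs/logo.png" width="60">'

html_imgs = re.findall(r"src=\"(.*?)\"", md)          # -> ['figs/logo.png']
markdown_imgs = re.findall(r"!\[.*?\]\((.*?)\)", md)  # -> ['figs/arch.png']
print(html_imgs, markdown_imgs)
# src="..." references get rewritten to absolute raw-GitHub URLs, while markdown
# image links are inlined as base64 "data:image/..." payloads, as the method above does.
```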
+ + Args: + fpath_gitdiff: raw git changes + + Generate the git change list: + > head=$(git rev-parse origin/main) + > git diff --name-only $head --output=master-diff.txt + + fpath_change_folders: output file with changed folders + fpath_drop_folders: output file with deleted folders + fpath_actual_dirs: files with listed all folder in particular stat + strict: raise error if some folder outside skipped does not have valid meta file + root_path: path to the root tobe added for all local folder paths in files + + Example: + $ python assistant.py group-folders ../target-diff.txt \ + --fpath_actual_dirs "['../dirs-main.txt', '../dirs-publication.txt']" + """ + with open(fpath_gitdiff) as fp: + changed = [ln.strip() for ln in fp.readlines()] + dirs = [os.path.dirname(ln) for ln in changed] + # not empty paths + dirs = [ln for ln in dirs if ln] + + if fpath_actual_dirs: + assert isinstance(fpath_actual_dirs, list) + assert all(os.path.isfile(p) for p in fpath_actual_dirs) + dir_sets = [{ln.strip() for ln in open(fp).readlines()} for fp in fpath_actual_dirs] + # get only different + dirs += list(set.union(*dir_sets) - set.intersection(*dir_sets)) + + if root_path: + dirs = [os.path.join(root_path, d) for d in dirs] + # unique folders + dirs = set(dirs) + # drop folder with skip folder + dirs = [pd for pd in dirs if not any(nd in AssistantCLI._SKIP_DIRS for nd in pd.split(os.path.sep))] + # valid folder has meta + dirs_exist = [d for d in dirs if os.path.isdir(d)] + dirs_invalid = [d for d in dirs_exist if not AssistantCLI._find_meta(d)] + if strict and dirs_invalid: + msg = f"Following folders do not have valid `{AssistantCLI._META_FILE_REGEX}`" + warn(f"{msg}: \n {os.linesep.join(dirs_invalid)}") + # check if there is other valid folder in its tree + dirs_invalid = [pd for pd in dirs_invalid if not AssistantCLI._is_ipynb_parent_dir(pd)] + if dirs_invalid: + raise FileNotFoundError(f"{msg} nor sub-folder: \n {os.linesep.join(dirs_invalid)}") + + dirs_change = [d for d in dirs_exist if AssistantCLI._find_meta(d)] + with open(fpath_change_folders, "w") as fp: + fp.write(os.linesep.join(sorted(dirs_change))) + + dirs_drop = [d for d in dirs if not os.path.isdir(d)] + with open(fpath_drop_folders, "w") as fp: + fp.write(os.linesep.join(sorted(dirs_drop))) + + @staticmethod + def generate_matrix(fpath_change_folders: str) -> str: + """Generate Azure matrix with leaf for each changed notebook. + + Args: + fpath_change_folders: output of previous ``group_folders`` + """ + with open(fpath_change_folders) as fp: + folders = [ln.strip() for ln in fp.readlines()] + # set default so the matrix has at least one runner + if not folders: + return "" + mtx = {} + for ln in folders: + mtx[ln] = { + "notebook": ln, + # TODO: allow defining some custom pools with different devices + "agent-pool": AssistantCLI._AZURE_POOL, + # TODO: allow defining some custom images with with python or PT + "docker-image": AssistantCLI._AZURE_DOCKER, + } + return json.dumps(mtx) + + @staticmethod + def _get_card_item_cell(path_ipynb: str, path_meta: str, path_thumb: Optional[str]) -> Dict[str, Any]: + """Build the card item cell for the given notebook path.""" + meta = yaml.safe_load(open(path_meta)) + + # Clamp description length + wrapped_description = wrap( + meta.get("short_description", meta["description"]).strip().replace(os.linesep, " "), 175 + ) + suffix = "..." 
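The JSON produced by `generate_matrix` becomes the `strategy: matrix:` of the Azure jobs further down; for a hypothetical `changed-folders.txt` with two entries (folder names made up) it would look like:

```python
import json

# Sketch of generate_matrix output for a hypothetical changed-folders.txt
# (folder names are made up; pool and image are the defaults defined above).
folders = ["lightning_examples/mnist", "course_UvA-DL/01-introduction"]

matrix = {
    folder: {
        "notebook": folder,
        "agent-pool": "lit-rtx-3090",
        "docker-image": "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1",
    }
    for folder in folders
}
print(json.dumps(matrix, indent=2))
# The Azure jobs read it back via "##vso[task.setVariable variable=dirs;isOutput=true]..."
# and `matrix: $[ dependencies.<job>.outputs['mtrx.dirs'] ]`.
```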
if len(wrapped_description) > 1 else "" + meta["short_description"] = wrapped_description[0] + suffix + + # Resolve some default tags based on accelerators and directory name + meta["tags"] = meta.get("tags", []) + + accelerators = meta.get("accelerator", ("CPU",)) + if ("GPU" in accelerators) or ("TPU" in accelerators): + meta["tags"].append("GPU/TPU") + + dirname = os.path.basename(os.path.dirname(path_ipynb)) + if dirname != ".notebooks": + meta["tags"].append(AssistantCLI._DIR_TO_TAG.get(dirname, dirname)) + + meta["tags"] = [tag.replace(" ", "-") for tag in meta["tags"]] + meta["tags"] = ",".join(meta["tags"]) + + # Build the notebook cell + rst_cell = TEMPLATE_CARD_ITEM % meta + + # Split lines + rst_cell_lines = rst_cell.strip().splitlines(True) + + if path_thumb is not None: + rst_cell_lines[-1] += "\n" + rst_cell_lines.append(f" :image: {path_thumb}") + + return { + "cell_type": "raw", + "metadata": {"raw_mimetype": "text/restructuredtext"}, + "source": rst_cell_lines, + } + + @staticmethod + def _resolve_path_thumb(path_ipynb: str, path_meta: str) -> Optional[str]: + """Find the thumbnail (assumes thumbnail to be any file that isn't metadata or notebook).""" + paths = list(set(glob.glob(path_ipynb.replace(".ipynb", ".*"))) - {path_ipynb, path_meta}) + if len(paths) == 0: + return None + assert len(paths) == 1, f"Found multiple possible thumbnail paths for notebook: {path_ipynb}." + path_thumb = paths[0] + path_thumb = path_thumb.split(os.path.sep) + path_thumb = os.path.sep.join(path_thumb[path_thumb.index(DIR_NOTEBOOKS) + 1 :]) + return path_thumb + + @staticmethod + def copy_notebooks( + path_root: str, + docs_root: str = "_docs/source", + path_docs_ipynb: str = "notebooks", + path_docs_images: str = "_static/images", + patterns: Sequence[str] = (".", "**"), + ) -> None: + """Copy all notebooks from a folder to doc folder. 
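For reference, the raw cell that ends up appended to each published notebook (so the Sphinx gallery can render its card) has roughly this shape; header, description, tags and thumbnail path below are all made up:

```python
# Illustrative only: the raw reStructuredText cell appended by copy_notebooks
# (all field values below are made up; the exact option indentation may differ).
card = """.. customcarditem::
   :header: Image classification with a tiny CNN
   :card_description: One-liner clamped to ~175 characters by _get_card_item_cell...
   :tags: Lightning-Examples,GPU/TPU
   :image: lightning_examples/tiny-cnn.png"""

cell = {
    "cell_type": "raw",
    "metadata": {"raw_mimetype": "text/restructuredtext"},
    "source": card.splitlines(True),
}
```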
+ + Args: + path_root: source path to the project root in these tutorials + docs_root: docs source directory + path_docs_ipynb: destination path to the notebooks' location relative to ``docs_root`` + path_docs_images: destination path to the images' location relative to ``docs_root`` + patterns: patterns to use when glob-ing notebooks + """ + ls_ipynb = [] + for sub in patterns: + ls_ipynb += glob.glob(os.path.join(path_root, DIR_NOTEBOOKS, sub, "*.ipynb")) + + os.makedirs(os.path.join(docs_root, path_docs_ipynb), exist_ok=True) + ipynb_content = [] + for path_ipynb in tqdm.tqdm(ls_ipynb): + ipynb = path_ipynb.split(os.path.sep) + sub_ipynb = os.path.sep.join(ipynb[ipynb.index(DIR_NOTEBOOKS) + 1 :]) + new_ipynb = os.path.join(docs_root, path_docs_ipynb, sub_ipynb) + os.makedirs(os.path.dirname(new_ipynb), exist_ok=True) + + path_meta = path_ipynb.replace(".ipynb", ".yaml") + path_thumb = AssistantCLI._resolve_path_thumb(path_ipynb, path_meta) + + if path_thumb is not None: + new_thumb = os.path.join(docs_root, path_docs_images, path_thumb) + old_path_thumb = os.path.join(path_root, DIR_NOTEBOOKS, path_thumb) + os.makedirs(os.path.dirname(new_thumb), exist_ok=True) + copyfile(old_path_thumb, new_thumb) + path_thumb = os.path.join(path_docs_images, path_thumb) + + print(f"{path_ipynb} -> {new_ipynb}") + + with open(path_ipynb) as f: + ipynb = json.load(f) + + ipynb["cells"].append(AssistantCLI._get_card_item_cell(path_ipynb, path_meta, path_thumb)) + + with open(new_ipynb, "w") as f: + json.dump(ipynb, f) + + ipynb_content.append(os.path.join("notebooks", sub_ipynb)) + + @staticmethod + def update_env_details(folder: str, base_path: str = DIR_NOTEBOOKS) -> str: + """Export the actual packages used in runtime. + + Args: + folder: path to the folder + base_path: + """ + meta = AssistantCLI._load_meta(folder) + # default is COU runtime + with open(PATH_REQ_DEFAULT) as fp: + req = fp.readlines() + req += meta.get("requirements", []) + req = [r.strip() for r in req] + + def _parse_package_name(pkg: str, keys: str = " !<=>[]@", egg_name: str = "#egg=") -> str: + """Parsing just the package name.""" + if egg_name in pkg: + pkg = pkg[pkg.index(egg_name) + len(egg_name) :] + if any(c in pkg for c in keys): + ix = min(pkg.index(c) for c in keys if c in pkg) + pkg = pkg[:ix] + return pkg + + require = {_parse_package_name(r) for r in req if r} + env = {_parse_package_name(p): p for p in freeze.freeze()} + meta["environment"] = [env[r] for r in require] + meta["published"] = datetime.now().isoformat() + + fmeta = os.path.join(base_path, folder) + ".yaml" + yaml.safe_dump(meta, stream=open(fmeta, "w"), sort_keys=False) + return fmeta + + @staticmethod + def list_dirs(folder: str = "", include_file_ext: str = "") -> str: + """List all sub-folders in a given tree including any ipynb.""" + dirs = glob.glob(os.path.join(folder, "*" + include_file_ext)) + dirs += glob.glob(os.path.join(folder, "**", "*" + include_file_ext)) + if include_file_ext: + _ignore_base_dir = lambda p: os.path.sep.join(p.split(os.path.sep)[1:]) # noqa: E731 + # Take the notebook as a folder (notebook are on teh same level as the raw tutorial file mix) + dirs = [os.path.splitext(_ignore_base_dir(p))[0] for p in dirs] + else: + dirs = [p for p in dirs if os.path.isdir(p)] + return os.linesep.join(sorted(dirs)) + + +if __name__ == "__main__": + fire.Fire(AssistantCLI) diff --git a/.actions/git-diff-sync.sh b/.actions/git-diff-sync.sh new file mode 100644 index 0000000..c461e6d --- /dev/null +++ b/.actions/git-diff-sync.sh @@ -0,0 +1,37 
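The enriched `.yaml` written next to each published notebook pins the runtime environment by matching requirement specs against `pip freeze`; that matching hinges on the small name-normalisation helper above. A worked example with made-up requirement strings:

```python
# Worked example of the name normalisation inside update_env_details
# (requirement strings below are made up).
def parse_package_name(pkg: str, keys: str = " !<=>[]@", egg_name: str = "#egg=") -> str:
    if egg_name in pkg:  # keep only the egg name of a VCS requirement
        pkg = pkg[pkg.index(egg_name) + len(egg_name):]
    if any(c in pkg for c in keys):  # cut at the first specifier/extras character
        pkg = pkg[: min(pkg.index(c) for c in keys if c in pkg)]
    return pkg

for req in ("torch>=1.8.1", "torchmetrics[image]", "git+https://github.com/user/repo#egg=myplugin"):
    print(req, "->", parse_package_name(req))
# torch>=1.8.1 -> torch
# torchmetrics[image] -> torchmetrics
# git+https://github.com/user/repo#egg=myplugin -> myplugin
```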
@@ +#!/bin/bash + +set -e +printf "Detect changes for: $1 >> $2\n\n" + +b1="${1//'/'/'_'}" +printf "Branch alias: $b1\n" +# list all dirs in source branch +python .actions/assistant.py list_dirs > "dirs-$b1.txt" +cat "dirs-$b1.txt" + +head=$(git rev-parse origin/$2) +git diff --name-only $head --output=target-diff.txt +printf "\nRaw changes:\n" +cat target-diff.txt +# transfer the source CLI version +mkdir -p _TEMP +cp -r .actions/ _TEMP/.actions/ + +git checkout $2 +b2="${2//'/'/'_'}" +printf "Branch alias: $b2\n" +# recover the original CLI +#rm -rf .actions && mv _TEMP/.actions .actions +# list all dirs in target branch +python _TEMP/.actions/assistant.py list_dirs ".notebooks" --include_file_ext=".ipynb" > "dirs-$b2.txt" +cat "dirs-$b2.txt" + +printf "\n\n" +git merge --ff -s resolve origin/$1 + +python _TEMP/.actions/assistant.py group-folders target-diff.txt --fpath_actual_dirs "['dirs-$b1.txt', 'dirs-$b2.txt']" +printf "\n\nChanged folders:\n" +cat changed-folders.txt +printf "\n\nDropped folders:\n" +cat dropped-folders.txt +printf "\n" diff --git a/.actions/requires.txt b/.actions/requires.txt new file mode 100644 index 0000000..ff35f70 --- /dev/null +++ b/.actions/requires.txt @@ -0,0 +1,6 @@ +Fire +tqdm +PyYAML +wcmatch +requests +pip diff --git a/.actions/test_cli.py b/.actions/test_cli.py new file mode 100644 index 0000000..b73d686 --- /dev/null +++ b/.actions/test_cli.py @@ -0,0 +1,30 @@ +import os + +import pytest +from assistant import AssistantCLI + +_PATH_ROOT = os.path.dirname(os.path.dirname(__file__)) +_PATH_TEMPLATES = os.path.join(_PATH_ROOT, "templates") +_PATH_DIR_SIMPLE = os.path.join(_PATH_TEMPLATES, "simple") +_PATH_DIR_TITANIC = os.path.join(_PATH_TEMPLATES, "titanic") + + +def _path_in_dir(fname: str, folder: str = _PATH_ROOT) -> str: + return os.path.join(folder, fname) + + +@pytest.mark.parametrize( + "cmd,args", + [ + ("list_dirs", []), + ("list_dirs", [".", ".ipynb"]), + ("bash_render", [_PATH_DIR_SIMPLE]), + ("bash_test", [_PATH_DIR_SIMPLE]), + ("group_folders", [_path_in_dir("master-diff.txt"), _path_in_dir("dirs-b1.txt"), _path_in_dir("dirs-b2.txt")]), + ("convert_ipynb", [_PATH_DIR_SIMPLE]), + ("copy_notebooks", [_PATH_ROOT]), + ("update_env_details", [_PATH_DIR_SIMPLE]), + ], +) +def test_assistant_commands(cmd: str, args: list): + AssistantCLI().__getattribute__(cmd)(*args) diff --git a/.azure/ipynb-publish.yml b/.azure/ipynb-publish.yml new file mode 100644 index 0000000..4e99d21 --- /dev/null +++ b/.azure/ipynb-publish.yml @@ -0,0 +1,172 @@ +trigger: + # this shall process all these workflows in sequence even several PRs are merged shortly + batch: "true" + # publish notebooks only from default/main branch + branches: + include: [ main ] + +# no run on PR as this is exclusive for publishing notebooks +pr: none + +jobs: + + - job: sync_pub + pool: + vmImage: 'Ubuntu-20.04' + variables: + ACCELERATOR: CPU,GPU + PUB_BRANCH: publication + COMMIT_MSG: $(echo "$(Build.SourceVersionMessage)" | head -n 1) + COMMIT_HASH: "$(Build.SourceVersion)" + steps: + - bash: | + git config --global user.email "pipelines@azure.com" + git config --global user.name "Azure Pipelines" + printf "commit hash:\n $(COMMIT_HASH)\n" + printf "commit message:\n $(COMMIT_MSG)\n" + displayName: 'Set Git user' + - bash: | + set -e + git fetch --all + echo $(PUB_BRANCH) + git ls-remote --heads origin ${PUB_BRANCH} | grep ${PUB_BRANCH} >/dev/null + if [ "$?" 
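`git-diff-sync.sh` feeds `group_folders` both the raw diff and one folder listing per branch; directories that appear in only one of the two listings are treated as changed as well, which is how renamed or removed tutorials end up in `dropped-folders.txt`. A worked sketch with made-up folder names:

```python
# Worked sketch (made-up folder names) of the branch-listing comparison in group_folders.
dirs_main = {"lightning_examples/mnist", "course_UvA-DL/01-introduction"}  # dirs-<branch1>.txt
dirs_pub = {"lightning_examples/mnist", "lightning_examples/old-demo"}     # dirs-<branch2>.txt

# identical to: set.union(*dir_sets) - set.intersection(*dir_sets)
changed_somewhere = (dirs_main | dirs_pub) - (dirs_main & dirs_pub)
print(sorted(changed_somewhere))
# ['course_UvA-DL/01-introduction', 'lightning_examples/old-demo']
# Folders that still exist (and carry a .meta file) go to changed-folders.txt,
# the missing ones go to dropped-folders.txt and get pruned from .notebooks/.
```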
== "1" ] ; then echo "Branch doesn't exist"; exit; fi + displayName: 'Git branch check' + + - bash: pip install -r .actions/requires.txt + displayName: 'Install dependencies' + - bash: | + current_branch=$(cut -d '/' -f3- <<< $(Build.SourceBranch)) + printf "$current_branch\n" + bash .actions/git-diff-sync.sh $current_branch $(PUB_BRANCH) + displayName: 'Compare changes & sync' + + - bash: | + notebooks=$(python .actions/assistant.py generate-matrix changed-folders.txt) + printf "Changed notebooks: $notebooks\n" + echo "##vso[task.setVariable variable=dirs;isOutput=true]$notebooks" + name: mtrx + displayName: 'Changed matrix' + + - bash: | + # remove notebooks which have moved + while IFS= read -r line; do + git rm .notebooks/$line.ipynb + git rm .notebooks/$line.yaml + done <<< $(cat dropped-folders.txt) + git status + git commit -m "prune: $(COMMIT_HASH)" + condition: gt(variables['dropped.folders'], 0) + displayName: 'Prune notebook' + + - bash: | + git status + git push https://$(PAT_GHOST)@github.com/Lightning-AI/tutorials.git $(PUB_BRANCH) + displayName: 'Finish push' + + - job: papermill + dependsOn: sync_pub + strategy: + # generated matrix with changed notebooks, include fields: "notebook", "agent-pool" and "docker-image" + matrix: $[ dependencies.sync_pub.outputs['mtrx.dirs'] ] + # Maximum number of jobs running in parallel, use 1 to run in sequence and reduce collisions + maxParallel: "1" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + # how long to run the job before automatically cancelling + # When 0 is specified, the maximum limit is used: + # - For 360 minutes (6 hours) on Microsoft-hosted agents with a public project and public repository + # - For 60 minutes on Microsoft-hosted agents with a private project or private repository + timeoutInMinutes: "180" + + pool: "$(agent-pool)" + # this need to have installed docker in the base machine/image... + container: + image: "$(docker-image)" + options: "--gpus=all --shm-size=32g -v /usr/bin/docker:/tmp/docker:ro" + + variables: + ACCELERATOR: CPU,GPU + PUB_BRANCH: publication + PATH_DATASETS: "$(Build.Repository.LocalPath)/.datasets" + COMMIT_MSG: $(echo "$(Build.SourceVersionMessage)" | head -n 1) + COMMIT_HASH: "$(Build.SourceVersion)" + DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) + + condition: ne(dependencies.sync_pub.outputs['mtrx.dirs'], '') + + steps: + - bash: | + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + echo "##vso[task.setvariable variable=CONTAINER_ID]$(head -1 /proc/self/cgroup|cut -d/ -f3)" + displayName: 'Set environment variables' + + - bash: | + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + echo $CUDA_VISIBLE_DEVICES + echo $CONTAINER_ID + python --version + pip list + displayName: 'Image info & NVIDIA' + + - script: | + /tmp/docker exec -t -u 0 $CONTAINER_ID \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + + - bash: | + git config --global user.email "pipelines@azure.com" + git config --global user.name "Azure Pipelines" + printf "commit hash:\n $(COMMIT_HASH)\n" + printf "commit message:\n $(COMMIT_MSG)\n" + displayName: 'Set Git user' + - bash: | + set -e + git fetch --all + echo $(PUB_BRANCH) + git ls-remote --heads origin ${PUB_BRANCH} | grep ${PUB_BRANCH} >/dev/null + if [ "$?" 
== "1" ] ; then echo "Branch doesn't exist"; exit; fi + git checkout $(PUB_BRANCH) + git show-ref $(PUB_BRANCH) + git pull + displayName: 'Git check & switch branch' + + - bash: | + set -e + sudo apt-get update -q --fix-missing + sudo apt install -y tree ffmpeg + #pip install --upgrade pip + #pip --version + pip install -r requirements.txt -r _requirements/data.txt + pip list + displayName: 'Install dependencies' + + - bash: | + set -e + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu > 0, f'GPU: {mgpu}'" + python -m papermill --version + displayName: 'Sanity check' + + - bash: python .actions/assistant.py convert-ipynb $(notebook) + displayName: 'Generate notebook' + + - bash: | + set -e + mkdir $(PATH_DATASETS) + python .actions/assistant.py bash-render $(notebook) + cat .actions/_ipynb-render.sh + bash .actions/_ipynb-render.sh + git status + git commit -m "publish [GPU]: $(notebook)" + env: + KAGGLE_USERNAME: $(KAGGLE_USERNAME) + KAGGLE_KEY: $(KAGGLE_KEY) + displayName: 'Render notebook' + + - bash: | + git status + git show-ref $(PUB_BRANCH) + git push https://$(PAT_GHOST)@github.com/Lightning-AI/tutorials.git $(PUB_BRANCH) + displayName: 'Finish push' diff --git a/.azure/ipynb-tests.yml b/.azure/ipynb-tests.yml new file mode 100644 index 0000000..f7c9d24 --- /dev/null +++ b/.azure/ipynb-tests.yml @@ -0,0 +1,106 @@ +trigger: none +pr: + branches: + include: [ main ] + autoCancel: "true" + drafts: "true" + +# Multi-job configuration +# - https://learn.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#multi-job-configuration + +jobs: + + - job: check_diff + pool: + vmImage: 'Ubuntu-20.04' + steps: + - bash: | + pip install -r .actions/requires.txt + pip list + displayName: 'Install dependencies' + + - bash: | + head=$(git rev-parse origin/main) + printf "Head: $head\n" + git diff --name-only $head --output=target-diff.txt + python .actions/assistant.py group-folders --fpath_gitdiff=target-diff.txt + printf "Changed folders:\n" + cat changed-folders.txt + displayName: 'Process folders' + + - bash: | + notebooks=$(python .actions/assistant.py generate-matrix changed-folders.txt) + printf "Changed notebooks: $notebooks\n" + echo "##vso[task.setVariable variable=dirs;isOutput=true]$notebooks" + name: mtrx + displayName: 'Changed matrix' + + - job: nbval + dependsOn: check_diff + strategy: + matrix: $[ dependencies.check_diff.outputs['mtrx.dirs'] ] + # how long to run the job before automatically cancelling + timeoutInMinutes: "95" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + + pool: "$(agent-pool)" + # this need to have installed docker in the base image... 
+ container: + image: "$(docker-image)" + options: "--gpus=all --shm-size=32g -v /usr/bin/docker:/tmp/docker:ro" + + variables: + ACCELERATOR: CPU,GPU + PATH_DATASETS: "$(Build.Repository.LocalPath)/.datasets" + DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) + + condition: ne(dependencies.check_diff.outputs['mtrx.dirs'], '') + + steps: + + - bash: | + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + echo "##vso[task.setvariable variable=CONTAINER_ID]$(head -1 /proc/self/cgroup|cut -d/ -f3)" + displayName: 'Set environment variables' + + - bash: | + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + echo $CUDA_VISIBLE_DEVICES + echo $CONTAINER_ID + python --version + pip list | grep torch + displayName: 'Image info & NVIDIA' + + - script: | + /tmp/docker exec -t -u 0 $CONTAINER_ID \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + displayName: 'Install Sudo in container (thanks Microsoft!)' + + - bash: | + set -e + sudo apt-get update -q --fix-missing + sudo apt install -y tree ffmpeg + pip install -r requirements.txt -r _requirements/data.txt + pip list + displayName: 'Install dependencies' + + - bash: | + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu > 0, f'GPU: {mgpu}'" + displayName: 'Sanity check' + + - bash: python .actions/assistant.py convert-ipynb $(notebook) + displayName: 'Generate notebook' + + - bash: | + set -e + mkdir $(PATH_DATASETS) + python .actions/assistant.py bash-test $(notebook) + cat .actions/_ipynb-test.sh + bash .actions/_ipynb-test.sh + env: + KAGGLE_USERNAME: $(KAGGLE_USERNAME) + KAGGLE_KEY: $(KAGGLE_KEY) + displayName: 'PyTest notebook' diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..7196116 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,52 @@ +# see https://docs.codecov.io/docs/codecov-yaml +# Validation check: +# $ curl --data-binary @.codecov.yml https://codecov.io/validate + + +# https://docs.codecov.io/docs/codecovyml-reference +codecov: + bot: "codecov-io" + strict_yaml_branch: "yaml-config" + require_ci_to_pass: yes + notify: + # after_n_builds: 2 + wait_for_ci: yes + +coverage: + precision: 0 # 2 = xx.xx%, 0 = xx% + round: nearest # how coverage is rounded: down/up/nearest + range: 40...100 # custom range of coverage colors from red -> yellow -> green + status: + # https://codecov.readme.io/v1.0/docs/commit-status + project: + default: + target: 95% # specify the target coverage for each commit status + threshold: 30% # allow this little decrease on project + # https://github.com/codecov/support/wiki/Filtering-Branches + # branches: master + if_ci_failed: error + # https://github.com/codecov/support/wiki/Patch-Status + patch: + default: + threshold: 50% # allow this much decrease on patch + changes: false + +# https://docs.codecov.com/docs/github-checks#disabling-github-checks-patch-annotations +github_checks: + annotations: false + +parsers: + gcov: + branch_detection: + conditional: true + loop: true + macro: false + method: false + javascript: + enable_partials: false + +comment: + layout: header, diff + require_changes: false + behavior: default # update if exists else create new + # branches: * diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..180a922 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.ipynb filter=lfs diff=lfs merge=lfs -text diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..24a71a1 --- 
/dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,24 @@ +# This is a comment. +# Each line is a file pattern followed by one or more owners. + +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# @global-owner1 and @global-owner2 will be requested for +# review when someone opens a pull request. +* @borda @rohitgr7 @carmocca @kaushikb11 @SeanNaren @ethanwharris + +# CI/CD and configs +/.actions/ @borda @ethanwharris +/.azure-*/ @borda @ethanwharris +/.github/ @borda @ethanwharris +/_requirements/ @borda @ethanwharris +*.yml @borda @ethanwharris +requirements.txt @borda @ethanwharris + +# Docs +/_docs/ @borda @ethanwharris @rohitgr7 +/.github/*.md @borda @ethanwharris @rohitgr7 +/.github/ISSUE_TEMPLATE/ @borda @ethanwharris @rohitgr7 + +/.github/CODEOWNERS @borda +/README.md @borda diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..b2510fc --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,81 @@ +# Contributing + +Welcome to the PyTorch Lightning community! We're building the most advanced research platform on the planet to implement the latest, best practices that the amazing PyTorch team rolls out! + +## Design Principles + +We encourage all sorts of contributions you're interested in adding! When writing Tutorials, please follow these principles. + +#### Simple Internal Code + +It's useful for users to look at the code and understand very quickly what's happening. +Many users won't be engineers. Thus we need to value clear, simple code over condensed ninja moves. +While that's super cool, this isn't the project for that :) + +#### Force User Decisions To Best Practices + +There are 1,000 ways to do something. However, eventually one popular solution becomes standard practice, and everyone follows. +We try to find the best way to solve a particular problem, and then force our users to use it for readability and simplicity. + +When something becomes a best practice, we add it to the framework. This is usually something like bits of code in utils or in the model file that everyone keeps adding over and over again across projects. When this happens, bring that code inside the trainer and add a flag for it. + +#### Gain User Trust + +As a researcher, you can't have any part of your code going wrong. So, make thorough tests to ensure that every implementation of a new trick or subtle change is correct. + +#### Interoperability + +PyTorch Lightning Tutorials is highly interoperable with PyTorch Lightning and PyTorch. + +______________________________________________________________________ + +## Contribution Types + +We are always looking for help to implement new features or fixing bugs. + +A lot of good work has already been done in project mechanics (\_requirements/base.txt, setup.py, pep8, badges, ci, etc...) so we're in a good state there thanks to all sooner contributors! + +### Bug Fixes: + +1. If you find a bug please submit a GitHub issue. Make sure the title explains the issue. +1. Try to fix it or recommend a solution. +1. Submit a PR! + +_**Note**, even if you do not find the solution, sending a PR with a test covering the issue is a valid contribution, and we can help you or finish it with you :\]_ + +### New Models: + +PyTorch Lightning Tutorials shows several research models for ready usage. Following are general guidelines for adding new models. + +1. Workflows which are standard baselines +1. Whose results are reproduced properly either by us or by authors. +1. 
Do not reinvent the wheel; prefer models natively supported by torchvision, torchtext, or torchaudio. +1. Use models with open-source licenses. + +Please raise an issue before adding a new tutorial. New models keep appearing, and it is very difficult to support every piece. + +______________________________________________________________________ + +## Guidelines + +For this section, please refer to the [parent PL guidelines](https://pytorch-lightning.readthedocs.io/en/stable/CONTRIBUTING.html). + +**Reminder** + +All added or edited code must be the original work of the contributor. +If you use a third-party implementation, all such blocks/functions/modules must be properly referenced and, if possible, approved by the code's author. For example - `This code is inspired from http://...`. +If you add new dependencies, make sure they are compatible with the PyTorch Lightning license (each tutorial can have its own license). + +### Question & Answer + +1. **How can I help/contribute?** + + All help is extremely welcome - reporting bugs, fixing documentation, etc. To get started, you can pick an issue labeled [good first issue](https://github.com/Lightning-AI/lightning-bolts/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) or choose something close to your domain. Before you start implementing anything, check that the issue description is clear and self-assign the task (if that is not possible, just comment that you are taking it, and we will assign it to you...). + +1. **Is there a recommendation for branch names?** + + We do not rely on a naming convention as long as you work in your own fork. Still, it is nice to follow the convention `/_`, where the types are: `bugfix`, `ipynb`, `docs`, ... + +1. **I have a model in a framework other than PyTorch, how do I add it here?** + + Since the PL Tutorials aim at PyTorch Lightning implementations, we encourage staying with it. diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..fd7a397 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,28 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug / fix, help wanted +assignees: '' +--- + +## 🐛 Bug + + + +### To Reproduce + +Steps to reproduce the behavior: + +1. Run '....' +1.
See error + + + +### Expected behavior + + + +### Additional context + + diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..c52bf19 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Ask a Question + url: https://github.com/Lightning-AI/tutorials/discussions/new + about: Ask and answer Lightning related questions + - name: 💬 Slack + url: https://app.slack.com/client/TR9DVT48M/CQXV8BRH9/thread/CQXV8BRH9-1591382895.254600 + about: Chat with our community diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..11378b2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,25 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' +--- + +## 🚀 Feature + +### Motivation + + + +### Pitch + + + +### Alternatives + + + +### Additional context + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..429d86a --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,18 @@ +## Before submitting + +- [ ] Was this **discussed/approved** via a Github issue? (no need for typos and docs improvements) +- [ ] Did you make sure to **update the docs**? +- [ ] Did you write any new **necessary tests**? + +## What does this PR do? + +Fixes # (issue) + +## PR review + +Anyone in the community is free to review the PR once the tests have passed. +If we didn't discuss your PR in Github issues there's a high chance it will not be merged. + +## Did you have fun? + +Make sure you had fun coding 🙃 diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..a01cfc7 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,30 @@ +# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file +version: 2 +updates: + # Enable version updates for python + - package-ecosystem: "pip" + # Look for a `requirements` in the `root` directory + directory: "/_requirements" + # Check for updates once a week + schedule: + interval: "monthly" + # Labels on pull requests for version updates only + labels: ["ci/cd"] + pull-request-branch-name: + # Separate sections of the branch name with a hyphen + separator: "-" + # Allow up to 5 open pull requests for pip dependencies + open-pull-requests-limit: 5 + reviewers: + - "Lightning-AI/teams/core-lightning" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + labels: ["ci/cd"] + pull-request-branch-name: + separator: "-" + open-pull-requests-limit: 5 + reviewers: + - "Lightning-AI/core-lightning" diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000..621b7e0 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,7 @@ +documentation: + - _docs/**/* + +CI/CD: + - .actions/**/* + - .azure-*/**/* + - .github/**/* diff --git a/.github/mergify.yml b/.github/mergify.yml new file mode 100644 index 0000000..06f25ec --- /dev/null +++ b/.github/mergify.yml @@ -0,0 +1,32 @@ +pull_request_rules: + + - name: warn on conflicts + conditions: + - conflict + - -draft # filter-out GH draft PRs + - -label="has conflicts" + actions: + # comment: + # message: This pull request is now in conflict... 
:( + label: + add: [ "has conflicts" ] + + - name: resolved conflicts + conditions: + - -conflict + - label="has conflicts" + - -draft # filter-out GH draft PRs + - -merged # not merged yet + - -closed + actions: + label: + remove: [ "has conflicts" ] + + - name: add core reviewer + conditions: + # number of review approvals + - "#approved-reviews-by<2" + actions: + request_reviews: + users: + - Borda diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 0000000..8dd7aca --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,26 @@ +# https://github.com/marketplace/stale + +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 60 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 14 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security +# Label to use when marking an issue as stale +staleLabel: won't fix +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when closing a stale issue. Set to `false` to disable +closeComment: false + +# Set to true to ignore issues in a project (defaults to false) +exemptProjects: true +# Set to true to ignore issues in a milestone (defaults to false) +exemptMilestones: true +# Set to true to ignore issues with an assignee (defaults to false) +exemptAssignees: true diff --git a/.github/workflows/ci_block-ipybn.yml b/.github/workflows/ci_block-ipybn.yml new file mode 100644 index 0000000..cde61ec --- /dev/null +++ b/.github/workflows/ci_block-ipybn.yml @@ -0,0 +1,13 @@ +name: Prevent adding/chnaging notebooks + +# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows +on: # Trigger the workflow on PR to master + pull_request: + paths: + - ./**/*.ipynb + +jobs: + block-ipynb: + runs-on: ubuntu-latest + steps: + - run: exit 1 diff --git a/.github/workflows/ci_block-pub.yml b/.github/workflows/ci_block-pub.yml new file mode 100644 index 0000000..e0a2e81 --- /dev/null +++ b/.github/workflows/ci_block-pub.yml @@ -0,0 +1,12 @@ +name: Prevent modify publication + +# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows +on: # Trigger the workflow on PR to master + pull_request: + branches: [publication] + +jobs: + block-ipynb: + runs-on: ubuntu-latest + steps: + - run: exit 1 diff --git a/.github/workflows/ci_checks.yml b/.github/workflows/ci_checks.yml new file mode 100644 index 0000000..408da93 --- /dev/null +++ b/.github/workflows/ci_checks.yml @@ -0,0 +1,18 @@ +name: General checks + +on: + push: + branches: + - "*" + - "**" + - "!publication" + pull_request: {} + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} + cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} + +jobs: + + check-schema: + uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.8.0 diff --git a/.github/workflows/ci_docs.yml b/.github/workflows/ci_docs.yml new file mode 100644 index 0000000..7902d98 --- /dev/null +++ b/.github/workflows/ci_docs.yml @@ -0,0 +1,116 @@ +name: validate Docs + +on: # Trigger the workflow on push or pull request +# push: +# branches: [main] + pull_request: {} + schedule: + # At the end of every day + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + build-docs: + runs-on: ubuntu-latest + env: + PUB_BRANCH: publication + PATH_DATASETS: ${{ github.workspace }}/.datasets + timeout-minutes: 20 + steps: + - name: Checkout 🛎️ + uses: actions/checkout@v3 + with: + fetch-depth: 0 # fetch all history for all branches and tags + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + + - name: Cache pip + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: pip-${{ hashFiles('requirements.txt') }}-${{ hashFiles('_requirements/docs.txt') }} + restore-keys: pip- + + - name: Install dependencies + run: | + sudo apt-get update --fix-missing + sudo apt-get install -y tree + # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux + sudo apt-get install -y cmake pandoc texlive-latex-extra dvipng texlive-pictures + pip --version + pip install -q -r requirements.txt -r _requirements/docs.txt + pip list + shell: bash + + - name: Process folders + run: | + mkdir -p ${PATH_DATASETS} + head=$(git rev-parse origin/"${{ github.base_ref }}") + git diff --name-only $head --output=master-diff.txt + python .actions/assistant.py group-folders master-diff.txt + printf "Changed folders:\n" + cat changed-folders.txt + shell: bash + + - name: ">> output" + id: changed + run: python -c "lines = open('changed-folders.txt').readlines(); print(f'::set-output name=nb_dirs::{len(lines)}')" + + - uses: oleksiyrudenko/gha-git-credentials@v2.1 + with: + token: '${{ secrets.GITHUB_TOKEN }}' + global: true + - name: Sync to pub + run: git merge -s resolve origin/$PUB_BRANCH + + - name: Generate notebooks + if: steps.changed.outputs.nb_dirs != 0 + run: | + while IFS= read -r line; do + python .actions/assistant.py convert-ipynb $line + python .actions/assistant.py bash-render $line + cat .actions/_ipynb-render.sh + bash .actions/_ipynb-render.sh + done <<< $(cat changed-folders.txt) + env: + DRY_RUN: 1 + shell: bash + + - name: Copy notebooks + if: steps.changed.outputs.nb_dirs != 0 + run: | + while IFS= read -r line; do + dir=$(dirname $line) + mkdir -p changed-notebooks/${dir} + cp .notebooks/${line}.ipynb changed-notebooks/${dir}/ + done <<< $(cat changed-folders.txt) + tree changed-notebooks + shell: bash + + - uses: actions/upload-artifact@v3 + if: steps.changed.outputs.nb_dirs != 0 + with: + name: notebooks-${{ github.sha }} + path: changed-notebooks/ + + - name: Make Documentation + working-directory: ./_docs + run: make html --debug SPHINXOPTS="-W --keep-going" + + - name: Check External Links (Optional) + working-directory: ./_docs + run: make --jobs $(nproc) linkcheck + # ToDO: comment on PR if any link failed + continue-on-error: true + + - name: Upload built docs + uses: actions/upload-artifact@v3 + with: + name: docs-html-${{ github.sha }} + path: _docs/build/html/ + # Use always() to always run this step to publish test results when there are test 
failures + if: success() diff --git a/.github/workflows/ci_test-acts.yml b/.github/workflows/ci_test-acts.yml new file mode 100644 index 0000000..205e268 --- /dev/null +++ b/.github/workflows/ci_test-acts.yml @@ -0,0 +1,83 @@ +name: CI internal + +# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows +on: # Trigger the workflow on push or pull request, but only for the main branch + push: {} + pull_request: + branches: [main] + +defaults: + run: + shell: bash + +jobs: + pytest-internal: + + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-20.04 ] + python-version: ["3.8", "3.10"] + # Timeout: https://stackoverflow.com/a/59076067/4521646 + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # fetch all history for all branches and tags + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + # Note: This uses an internal pip API and may not always work + # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow + - name: Get pip cache dir + id: pip-cache + run: echo "::set-output name=dir::$(pip cache dir)" + + - name: pip cache + uses: actions/cache@v3 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ hashFiles('.actions/requires.txt') }}-${{ hashFiles('requirements/default.txt') }} + restore-keys: ${{ runner.os }}-pip-py${{ matrix.python-version }}- + + - name: Install requirements + run: | + pip --version + pip install -q -r .actions/requires.txt -r _requirements/test.txt + # this is needed to be able to run package version parsing test + pip install -q -r _requirements/default.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + + - name: Prepare dummy inputs + run: | + jupytext --set-formats ipynb,py:percent templates/simple/template.py + jupytext --set-formats ipynb,py:percent templates/titanic/tutorial.py + # mv templates/simple/template.ipynb templates/simple.ipynb + git diff --name-only HEAD~3 > master-diff.txt + python .actions/assistant.py list_dirs "" > dirs-b1.txt + python .actions/assistant.py list_dirs --include_file_ext=".ipynb" > dirs-b2.txt + + - name: Testing + run: | + coverage run -m pytest .actions -v + + - name: Statistics + if: success() + run: | + coverage report + coverage xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + if: always() + # see: https://github.com/actions/toolkit/issues/399 + continue-on-error: true + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: coverage.xml + flags: pytest,${{ runner.os }} + name: CLI-coverage + fail_ci_if_error: false diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml new file mode 100644 index 0000000..0547d8b --- /dev/null +++ b/.github/workflows/docs-deploy.yml @@ -0,0 +1,58 @@ +name: Deploy Docs +on: + push: + branches: [publication] + workflow_dispatch: {} + workflow_run: + workflows: ["Publish notebook"] + types: + - completed + +jobs: + # https://github.com/marketplace/actions/deploy-to-github-pages + build-docs-deploy: + runs-on: ubuntu-20.04 + env: + PATH_DATASETS: ${{ github.workspace }}/.datasets + steps: + - name: Checkout 🛎️ + uses: actions/checkout@v3 + with: + ref: publication + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + + - name: Cache pip + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ 
hashFiles('requirements.txt') }}-${{ hashFiles('_requirements/docs.txt') }} + restore-keys: ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + mkdir -p ${PATH_DATASETS} + # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux + sudo apt-get update + sudo apt-get install -y cmake pandoc + sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures + pip --version + pip install --quiet --requirement _requirements/docs.txt + pip list + shell: bash + + - name: Make Documentation + working-directory: ./_docs + run: make html --jobs $(nproc) + + - name: Deploy 🚀 + uses: JamesIves/github-pages-deploy-action@v4.4.2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + branch: gh-pages # The branch the action should deploy to. + folder: _docs/build/html # The folder the action should deploy. + clean: true # Automatically remove deleted files from the deploy branch + target-folder: docs # If you'd like to push the contents of the deployment folder into a specific directory + single-commit: true # you'd prefer to have a single commit on the deployment branch instead of full history + if: success() diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..df8737c --- /dev/null +++ b/.gitignore @@ -0,0 +1,137 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +_docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.idea/ + +# data artifacts +logs/ +lightning_logs/ +cifar-10-batches-py +*.tar.gz diff --git a/.jupytext.yml b/.jupytext.yml new file mode 100644 index 0000000..fa534dd --- /dev/null +++ b/.jupytext.yml @@ -0,0 +1,4 @@ +# todo: this seems to have no effect atm + +# Always pair ipynb notebooks to py:percent files +formats: ipynb,py:percent diff --git a/.notebooks/README.md b/.notebooks/README.md new file mode 100644 index 0000000..3c73175 --- /dev/null +++ b/.notebooks/README.md @@ -0,0 +1,10 @@ +shadow folder for generated notebooks, no uploading here + +## reBuild all notebooks + +```bash +git checkout -b publication main +make ipynb +git commit -m "regenerate all notebooks" +git push +``` diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..678b601 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,67 @@ +default_language_version: + python: python3.8 + +ci: + autofix_prs: true + autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' + autoupdate_schedule: quarterly + # submodules: true + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-case-conflict + - id: check-json + - id: check-yaml + - id: check-toml + - id: check-added-large-files + args: ['--maxkb=250', '--enforce-all'] + - id: check-docstring-first + - id: detect-private-key + + - repo: https://github.com/asottile/pyupgrade + rev: v3.8.0 + hooks: + - id: pyupgrade + args: [--py37-plus] + name: Upgrade code + + - repo: https://github.com/PyCQA/docformatter + rev: v1.7.3 + hooks: + - id: docformatter + args: [--in-place, --wrap-summaries=115, --wrap-descriptions=120] + + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + + - repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + name: Format code + + - repo: https://github.com/asottile/yesqa + rev: v1.5.0 + hooks: + - id: yesqa + + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.16 + hooks: + - id: mdformat + additional_dependencies: + - mdformat-gfm + - mdformat-black + - mdformat_frontmatter + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.276 + hooks: + - id: ruff + args: ["--fix"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..764c44a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,39 @@
+.PHONY: ipynb clean docs
+
+# META := $(wildcard **/.meta.yml)
+META := $(shell find -regex ".*/.meta.y[a]?ml")
+IPYNB := $(META:%/.meta.yml=%.ipynb)
+IPYNB := $(IPYNB:%/.meta.yaml=%.ipynb)
+export PATH_DATASETS=$(PWD)/.datasets
+
+init:
+	@echo $(PATH_DATASETS)
+	mkdir -p $(PATH_DATASETS)
+
+ipynb: init ${IPYNB}
+#	@echo $<
+
+%.ipynb: %/.meta.y*ml
+	@echo $<
+	python .actions/assistant.py convert-ipynb $(shell dirname $<)
+	python .actions/assistant.py bash-render $(shell dirname $<)
+	bash .actions/_ipynb-render.sh
+
+docs: clean
+	pip install --quiet -r _requirements/docs.txt
+	python -m sphinx -b html -W --keep-going _docs/source _docs/build
+
+clean:
+	rm -rf ./.datasets
+	# clean all temp runs
+	rm -rf ./_docs/build
+	rm -rf ./_docs/source/notebooks
+	rm -rf ./_docs/source/api
+	rm -f ./dirs-*.txt
+	rm -f ./*-folders.txt
+	rm -f ./*/**/*.ipynb
+	rm -rf ./*/**/.ipynb_checkpoints
+	rm -rf ./*/**/venv
+	rm -rf ./*/**/logs
+	rm -rf ./*/**/lightning_logs
+	rm -f ./*/**/requirements.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5d8baa4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,99 @@
+# PytorchLightning Tutorials
+
+[![CI internal](https://github.com/Lightning-AI/tutorials/actions/workflows/ci_test-acts.yml/badge.svg?event=push)](https://github.com/Lightning-AI/tutorials/actions/workflows/ci_test-acts.yml)
+[![Build Status](https://dev.azure.com/Lightning-AI/Tutorials/_apis/build/status/Lightning-AI.tutorials%20%5Bpublish%5D?branchName=main)](https://dev.azure.com/Lightning-AI/Tutorials/_build/latest?definitionId=29&branchName=main)
+[![codecov](https://codecov.io/gh/Lightning-AI/tutorials/branch/main/graph/badge.svg?token=C6T3XOOR56)](https://codecov.io/gh/Lightning-AI/tutorials)
+[![Deploy Docs](https://github.com/Lightning-AI/tutorials/actions/workflows/docs-deploy.yml/badge.svg)](https://github.com/Lightning-AI/tutorials/actions/workflows/docs-deploy.yml)
+[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/Lightning-AI/tutorials/main.svg)](https://results.pre-commit.ci/latest/github/Lightning-AI/tutorials/main)
+
+This is the Lightning tutorials library: a collection of Lightning-related notebooks that are pulled back into the main repo as a submodule and rendered inside the main documentation.
+The key features/highlights:
+
+- we keep the repo **lightweight** - notebooks are stored in a rich script format
+- all scripts/notebooks are tested to be **fully executable**
+- runs are fully **reproducible** thanks to saved runtime environment details
+
+For more details, read our blog post - [Best Practices for Publishing PyTorch Lightning Tutorial Notebooks](https://devblog.pytorchlightning.ai/publishing-lightning-tutorials-cbea3eaa4b2c)
+
+## Adding/Editing notebooks
+
+The main branch of this repo contains only Python scripts with markdown extensions; the notebooks are generated in a special publication branch, so no raw notebooks are accepted in a PR.
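+For orientation, such a percent-format script keeps markdown cells as comments and code cells as plain Python; a minimal, purely illustrative example (a hypothetical `sample.py`) could look like:
+
+```py
+# %% [markdown]
+# ## A tiny section
+# Markdown cells are regular comments introduced by the `# %% [markdown]` marker.
+
+# %%
+# Code cells start with a bare `# %%` marker and contain ordinary Python.
+import torch
+
+print(torch.arange(4).sum())
+```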
+That said, we highly recommend creating the notebook first and converting it to a script with [jupytext](https://jupytext.readthedocs.io/en/latest/) as
+
+```bash
+jupytext --set-formats ipynb,py:percent my-notebook.ipynb
+```
+
+### Contribution structure
+
+Each addition has to be formed as a new folder:
+
+- the folder name is used for the future notebook
+- it contains a single Python script with the converted notebook (the file name does not matter)
+- it contains a metadata file named `.meta.yaml` with the following info:
+  ```yaml
+  title: Sample notebooks
+  author: [User](contact)
+  created: YYYY-MM-DD
+  updated: YYYY-MM-DD
+  license: CC BY-SA
+  # multi-line
+  description: |
+    This notebook will walk you through ...
+  requirements:
+    - package # with version if needed
+  # define supported - CPU|GPU|TPU
+  accelerator:
+    - CPU
+  ```
+
+### Using datasets
+
+It is quite common to use a public or competition dataset for your example.
+We facilitate this by defining the data sources in the metafile.
+There are two basic options: download a file from the web or pull a Kaggle dataset:
+
+```yaml
+datasets:
+  web:
+    - https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+  kaggle:
+    - titanic
+```
+
+In both cases, the downloaded archive (a Kaggle dataset is originally downloaded as a zip file) is extracted to the default dataset folder under a sub-folder with the same name as the downloaded file.
+To get the path to this dataset folder, use the environment variable `PATH_DATASETS`, so in your script use:
+
+```py
+import os
+
+data_path = os.environ.get("PATH_DATASETS", "_datasets")
+path_titanic = os.path.join(data_path, "titanic")
+```
+
+**Warning:** some Kaggle datasets can be quite large, and since the process involves both downloading and extracting, the particular runner needs roughly double the free space. For this reason, the CPU runner is limited to 3GB datasets.
+
+### Suggestions
+
+- For inserting images into text cells, use Markdown formatting so we can embed the images directly in the notebooks and drop any dependency on an internet connection; the generated notebooks can then be shared offline more easily
+- If your images need special sizes, use `![Caption](my-image.png){height="60px" width="240px"}`
+- If your notebook is demanding in compute or any other resource (CPU/RAM), use only the GPU accelerator option in the meta config
+
+### Known limitations
+
+- Nothing major at this moment
+
+## Meantime notes
+
+Behind the scenes, the publishing workflow consists in principle of these three steps
+
+```bash
+# 1) convert the script to a notebook
+jupytext --set-formats ipynb,py:percent notebook.py
+
+# 2) test the created notebook
+pytest -v notebook.ipynb --nbval
+
+# 3) generate the notebook outputs
+papermill in-notebook.ipynb out-notebook.ipynb
+```
diff --git a/_docs/.build_docs.sh b/_docs/.build_docs.sh
new file mode 100644
index 0000000..0419bd6
--- /dev/null
+++ b/_docs/.build_docs.sh
@@ -0,0 +1,2 @@
+make clean
+make html --debug --jobs $(nproc)
diff --git a/_docs/Makefile b/_docs/Makefile
new file mode 100644
index 0000000..197a2c7
--- /dev/null
+++ b/_docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS = -W
+SPHINXBUILD = python $(shell which sphinx-build)
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/_docs/make.bat b/_docs/make.bat new file mode 100644 index 0000000..4d9eb83 --- /dev/null +++ b/_docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/_docs/source/_static/images/icon.svg b/_docs/source/_static/images/icon.svg new file mode 100644 index 0000000..5ab3512 --- /dev/null +++ b/_docs/source/_static/images/icon.svg @@ -0,0 +1,62 @@ + + + + + + image/svg+xml + + + + + + + + + + diff --git a/_docs/source/_static/images/logo.png b/_docs/source/_static/images/logo.png new file mode 100644 index 0000000..a28606b Binary files /dev/null and b/_docs/source/_static/images/logo.png differ diff --git a/_docs/source/_static/images/logo.svg b/_docs/source/_static/images/logo.svg new file mode 100755 index 0000000..ec6a2ee --- /dev/null +++ b/_docs/source/_static/images/logo.svg @@ -0,0 +1,70 @@ + + + + + + + + image/svg+xml + + + + + + + + + + + + diff --git a/_docs/source/_templates/theme_variables.jinja b/_docs/source/_templates/theme_variables.jinja new file mode 100644 index 0000000..95adb3e --- /dev/null +++ b/_docs/source/_templates/theme_variables.jinja @@ -0,0 +1,18 @@ +{%- set external_urls = { + 'github': 'https://github.com/Lightning-AI/lightning-sandbox', + 'github_issues': 'https://github.com/Lightning-AI/lightning-sandbox/issues', + 'contributing': 'https://github.com/Lightning-AI/lightning/blob/master/CONTRIBUTING.md', + 'governance': 'https://github.com/Lightning-AI/lightning/blob/master/governance.md', + 'docs': 'https://lightning-sandbox.rtfd.io/en/latest', + 'twitter': 'https://twitter.com/PyTorchLightnin', + 'discuss': 'https://pytorch-lightning.slack.com', + 'tutorials': 'https://pt-lightning-sandbox.readthedocs.io/en/latest/#tutorials', + 'previous_pytorch_versions': 'https://pt-lightning-sandbox.rtfd.io/en/latest/', + 'home': 'https://pt-lightning-sandbox.rtfd.io/en/latest/', + 'get_started': 'https://pt-lightning-sandbox.readthedocs.io/en/latest/introduction_guide.html', + 'features': 'https://pt-lightning-sandbox.rtfd.io/en/latest/', + 'blog': 'https://www.pytorchlightning.ai/blog', + 'resources': 'https://pt-lightning-sandbox.readthedocs.io/en/latest/#community-examples', + 'support': 'https://pt-lightning-sandbox.rtfd.io/en/latest/', +} +-%} diff --git a/_docs/source/conf.py b/_docs/source/conf.py new file mode 100644 index 0000000..6558509 --- /dev/null +++ b/_docs/source/conf.py @@ -0,0 +1,231 @@ +# +# Configuration file for the Sphinx documentation 
builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +import os +import sys + +import pt_lightning_sphinx_theme + +_PATH_HERE = os.path.abspath(os.path.dirname(__file__)) +_PATH_ROOT = os.path.realpath(os.path.join(_PATH_HERE, "..", "..")) +sys.path.insert(0, os.path.abspath(_PATH_ROOT)) +sys.path.append(os.path.join(_PATH_ROOT, ".actions")) + +from assistant import AssistantCLI # noqa: E402 + +# -- Project information ----------------------------------------------------- + +# this name shall match the project name in Github as it is used for linking to code +project = "lightning-tutorials" +author = "PytorchLightning team" +copyright = f"Copyright (c) 2020-2021, {author}." +homepage = "https://www.pytorchlightning.ai" + +# # The short X.Y version +# version = about.__version__ +# # The full version, including alpha/beta/rc tags +# release = about.__version__ + +# Options for the linkcode extension +# ---------------------------------- +github_user = "PyTorchLightning" +github_repo = project + +# -- Project documents ------------------------------------------------------- + +AssistantCLI.copy_notebooks(_PATH_ROOT, _PATH_HERE) + +# with open(os.path.join(_PATH_HERE, 'ipynb_content.rst'), 'w') as fp: +# fp.write(os.linesep.join(ipynb_content)) + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. + +needs_sphinx = "6.2" + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.napoleon", + "sphinx.ext.imgmath", + "sphinx.ext.githubpages", + "nbsphinx", + "myst_parser", + "sphinx_paramlinks", + "pt_lightning_sphinx_theme.extensions.lightning", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# https://berkeley-stat159-f17.github.io/stat159-f17/lectures/14-sphinx..html#conf.py-(cont.) +# https://stackoverflow.com/questions/38526888/embed-ipython-notebook-in-sphinx-document +# I execute the notebooks manually in advance. If notebooks test the code, +# they should be run at build time. +nbsphinx_execute = "never" +nbsphinx_allow_errors = True +nbsphinx_requirejs_path = "" + +myst_update_mathjax = False + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_parsers = { + ".rst": "restructuredtext", + ".txt": "markdown", + ".md": "markdown", + ".ipynb": "nbsphinx", +} + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [ + "PULL_REQUEST_TEMPLATE.md", + "_build", + "**.ipynb_checkpoints", +] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "pt_lightning_sphinx_theme" +html_theme_path = [pt_lightning_sphinx_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. + +html_theme_options = { + "pytorch_project": homepage, + "canonical_url": homepage, + "collapse_navigation": False, + "display_version": True, + "logo_only": False, +} + +html_favicon = "_static/images/icon.svg" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_templates", "_static"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = project + "-doc" + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', + # Latex figure (float) alignment + "figure_align": "htbp", +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, project + ".tex", project + " Documentation", author, "manual"), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, project, project + " Documentation", [author], 1)] + +# -- Options for linkcheck builder ---------------------------------------------- +# regex pattern 0: allow linking to a specific selection state in +# tensorboard.dev links while continuing to validate the base experiment link +linkcheck_anchors_ignore = ["scalars.*&runSelectionState.*"] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + project, + project + " Documentation", + author, + project, + "" "Miscellaneous", # about.__docs__, + ), +] + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. 
This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ["search.html"] + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +# intersphinx_mapping = { +# "python": ("https://docs.python.org/3", None), +# "torch": ("https://pytorch.org/docs/stable/", None), +# "numpy": ("https://docs.scipy.org/doc/numpy/", None), +# } diff --git a/_docs/source/index.rst b/_docs/source/index.rst new file mode 100644 index 0000000..1564f99 --- /dev/null +++ b/_docs/source/index.rst @@ -0,0 +1,32 @@ +.. PyTorchLightning-Sandbox documentation master file, created by + sphinx-quickstart on Wed Mar 25 21:34:07 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Lightning-Sandbox documentation +=============================== + +.. tutoriallist:: + +.. raw:: html + +
+ +.. toctree:: + :maxdepth: 1 + :name: start + :caption: Start here + :glob: + + notebooks/**/* + +.. raw:: html + +
+ +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/_requirements/data.txt b/_requirements/data.txt new file mode 100644 index 0000000..08fe313 --- /dev/null +++ b/_requirements/data.txt @@ -0,0 +1,2 @@ +# fixed version to be able to call it as `python -m kaggle` +https://github.com/Borda/kaggle-api/archive/refs/heads/setup/python-m.zip diff --git a/_requirements/default.txt b/_requirements/default.txt new file mode 100644 index 0000000..913834f --- /dev/null +++ b/_requirements/default.txt @@ -0,0 +1,7 @@ +setuptools==67.8.0 +matplotlib>=3.0.0, <3.8.0 +ipython[notebook]>=8.0.0, <8.15.0 +urllib3 # for ipython +torch>=1.8.1, <2.1.0 +pytorch-lightning>=1.4, <2.1.0 +torchmetrics>=0.7, <0.12 diff --git a/_requirements/devel.txt b/_requirements/devel.txt new file mode 100644 index 0000000..e0ad0dc --- /dev/null +++ b/_requirements/devel.txt @@ -0,0 +1,6 @@ +virtualenv>=20.10 +jupytext>=1.10, <1.15 # converting +pytest>=6.0, <7.0 +# testing with own fork with extended cell timeout +https://github.com/Borda/nbval/archive/refs/heads/timeout-limit.zip +papermill>=2.3.4, <2.5.0 # render diff --git a/_requirements/docs.txt b/_requirements/docs.txt new file mode 100644 index 0000000..7adb81b --- /dev/null +++ b/_requirements/docs.txt @@ -0,0 +1,11 @@ +sphinx >=6.0, <7.0 +myst-parser >=2.0.0 +nbsphinx >=0.8 +pandoc >=1.0 +#docutils>=0.16 +sphinx-paramlinks >=0.4.0 +ipython[notebook] >=8.0.0, <8.2.0 + +pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip + +-r ../.actions/requires.txt diff --git a/_requirements/test.txt b/_requirements/test.txt new file mode 100644 index 0000000..e505fef --- /dev/null +++ b/_requirements/test.txt @@ -0,0 +1,5 @@ +coverage>=5.0 +codecov>=2.1 +pytest>=6.0 +pytest-cov +jupytext diff --git a/course_UvA-DL/01-introduction-to-pytorch/.meta.yml b/course_UvA-DL/01-introduction-to-pytorch/.meta.yml new file mode 100644 index 0000000..1e5b5b9 --- /dev/null +++ b/course_UvA-DL/01-introduction-to-pytorch/.meta.yml @@ -0,0 +1,15 @@ +title: "Tutorial 1: Introduction to PyTorch" +author: Phillip Lippe +created: 2021-08-27 +updated: 2023-03-14 +license: CC BY-SA +description: | + This tutorial will give a short introduction to PyTorch basics, and get you setup for writing your own neural networks. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - matplotlib + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/course_UvA-DL/01-introduction-to-pytorch/.thumb.jpg b/course_UvA-DL/01-introduction-to-pytorch/.thumb.jpg new file mode 100644 index 0000000..a56ca66 Binary files /dev/null and b/course_UvA-DL/01-introduction-to-pytorch/.thumb.jpg differ diff --git a/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py b/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py new file mode 100644 index 0000000..be0ca7e --- /dev/null +++ b/course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py @@ -0,0 +1,989 @@ +# %% [markdown] +#
+# Welcome to our PyTorch tutorial for the Deep Learning course 2020 at the University of Amsterdam! +# The following notebook is meant to give a short introduction to PyTorch basics, and get you setup for writing your own neural networks. +# PyTorch is an open source machine learning framework that allows you to write your own neural networks and optimize them efficiently. +# However, PyTorch is not the only framework of its kind. +# Alternatives to PyTorch include [TensorFlow](https://www.tensorflow.org/), [JAX](https://github.com/google/jax) and [Caffe](http://caffe.berkeleyvision.org/). +# We choose to teach PyTorch at the University of Amsterdam because it is well established, has a huge developer community (originally developed by Facebook), is very flexible and especially used in research. +# Many current papers publish their code in PyTorch, and thus it is good to be familiar with PyTorch as well. +# Meanwhile, TensorFlow (developed by Google) is usually known for being a production-grade deep learning library. +# Still, if you know one machine learning framework in depth, it is very easy to learn another one because many of them use the same concepts and ideas. +# For instance, TensorFlow's version 2 was heavily inspired by the most popular features of PyTorch, making the frameworks even more similar. +# If you are already familiar with PyTorch and have created your own neural network projects, feel free to just skim this notebook. +# +# We are of course not the first ones to create a PyTorch tutorial. +# There are many great tutorials online, including the ["60-min blitz"](https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html) on the official [PyTorch website](https://pytorch.org/tutorials/). +# Yet, we choose to create our own tutorial which is designed to give you the basics particularly necessary for the practicals, but still understand how PyTorch works under the hood. +# Over the next few weeks, we will also keep exploring new PyTorch features in the series of Jupyter notebook tutorials about deep learning. +# +# We will use a set of standard libraries that are often used in machine learning projects. +# If you are running this notebook on Google Colab, all libraries should be pre-installed. +# If you are running this notebook locally, make sure you have installed our `dl2020` environment ([link](https://github.com/uvadlc/uvadlc_practicals_2020/blob/master/environment.yml)) and have activated it. + +# %% +import time + +import matplotlib.pyplot as plt + +# %matplotlib inline +import matplotlib_inline.backend_inline +import numpy as np +import torch +import torch.nn as nn +import torch.utils.data as data +from matplotlib.colors import to_rgba +from torch import Tensor +from tqdm.notebook import tqdm # Progress bar + +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export + +# %% [markdown] +# ## The Basics of PyTorch +# +# We will start with reviewing the very basic concepts of PyTorch. +# As a prerequisite, we recommend to be familiar with the `numpy` package as most machine learning frameworks are based on very similar concepts. +# If you are not familiar with numpy yet, don't worry: here is a [tutorial](https://numpy.org/devdocs/user/quickstart.html) to go through. +# +# So, let's start with importing PyTorch. +# The package is called `torch`, based on its original framework [Torch](http://torch.ch/). 
+# As a first step, we can check its version: + +# %% +print("Using torch", torch.__version__) + +# %% [markdown] +# At the time of writing this tutorial (mid of August 2021), the current stable version is 1.9. +# You should therefore see the output `Using torch 1.9.0`, eventually with some extension for the CUDA version on Colab. +# In case you use the `dl2020` environment, you should see `Using torch 1.6.0` since the environment was provided in October 2020. +# It is recommended to update the PyTorch version to the newest one. +# If you see a lower version number than 1.6, make sure you have installed the correct the environment, or ask one of your TAs. +# In case PyTorch 1.10 or newer will be published during the time of the course, don't worry. +# The interface between PyTorch versions doesn't change too much, and hence all code should also be runnable with newer versions. +# +# As in every machine learning framework, PyTorch provides functions that are stochastic like generating random numbers. +# However, a very good practice is to setup your code to be reproducible with the exact same random numbers. +# This is why we set a seed below. + +# %% +torch.manual_seed(42) # Setting the seed + +# %% [markdown] +# ### Tensors +# +# Tensors are the PyTorch equivalent to Numpy arrays, with the addition to also have support for GPU acceleration (more on that later). +# The name "tensor" is a generalization of concepts you already know. +# For instance, a vector is a 1-D tensor, and a matrix a 2-D tensor. +# When working with neural networks, we will use tensors of various shapes and number of dimensions. +# +# Most common functions you know from numpy can be used on tensors as well. +# Actually, since numpy arrays are so similar to tensors, we can convert most tensors to numpy arrays (and back) but we don't need it too often. +# +# #### Initialization +# +# Let's first start by looking at different ways of creating a tensor. +# There are many possible options, the most simple one is to call +# `Tensor` passing the desired shape as input argument: + +# %% +x = Tensor(2, 3, 4) +print(x) + +# %% [markdown] +# The function `torch.Tensor` allocates memory for the desired tensor, but reuses any values that have already been in the memory. +# To directly assign values to the tensor during initialization, there are many alternatives including: +# +# * `torch.zeros`: Creates a tensor filled with zeros +# * `torch.ones`: Creates a tensor filled with ones +# * `torch.rand`: Creates a tensor with random values uniformly sampled between 0 and 1 +# * `torch.randn`: Creates a tensor with random values sampled from a normal distribution with mean 0 and variance 1 +# * `torch.arange`: Creates a tensor containing the values $N,N+1,N+2,...,M$ +# * `torch.Tensor` (input list): Creates a tensor from the list elements you provide + +# %% +# Create a tensor from a (nested) list +x = Tensor([[1, 2], [3, 4]]) +print(x) + +# %% +# Create a tensor with random values between 0 and 1 with the shape [2, 3, 4] +x = torch.rand(2, 3, 4) +print(x) + +# %% [markdown] +# You can obtain the shape of a tensor in the same way as in numpy (`x.shape`), or using the `.size` method: + +# %% +shape = x.shape +print("Shape:", x.shape) + +size = x.size() +print("Size:", size) + +dim1, dim2, dim3 = x.size() +print("Size:", dim1, dim2, dim3) + +# %% [markdown] +# #### Tensor to Numpy, and Numpy to Tensor +# +# Tensors can be converted to numpy arrays, and numpy arrays back to tensors. 
+# To transform a numpy array into a tensor, we can use the function `torch.from_numpy`: + +# %% +np_arr = np.array([[1, 2], [3, 4]]) +tensor = torch.from_numpy(np_arr) + +print("Numpy array:", np_arr) +print("PyTorch tensor:", tensor) + +# %% [markdown] +# To transform a PyTorch tensor back to a numpy array, we can use the function `.numpy()` on tensors: + +# %% +tensor = torch.arange(4) +np_arr = tensor.numpy() + +print("PyTorch tensor:", tensor) +print("Numpy array:", np_arr) + +# %% [markdown] +# The conversion of tensors to numpy require the tensor to be on the CPU, and not the GPU (more on GPU support in a later section). +# In case you have a tensor on GPU, you need to call `.cpu()` on the tensor beforehand. +# Hence, you get a line like `np_arr = tensor.cpu().numpy()`. + +# %% [markdown] +# #### Operations +# +# Most operations that exist in numpy, also exist in PyTorch. +# A full list of operations can be found in the [PyTorch documentation](https://pytorch.org/docs/stable/tensors.html#), but we will review the most important ones here. +# +# The simplest operation is to add two tensors: + +# %% +x1 = torch.rand(2, 3) +x2 = torch.rand(2, 3) +y = x1 + x2 + +print("X1", x1) +print("X2", x2) +print("Y", y) + +# %% [markdown] +# Calling `x1 + x2` creates a new tensor containing the sum of the two inputs. +# However, we can also use in-place operations that are applied directly on the memory of a tensor. +# We therefore change the values of `x2` without the chance to re-accessing the values of `x2` before the operation. +# An example is shown below: + +# %% +x1 = torch.rand(2, 3) +x2 = torch.rand(2, 3) +print("X1 (before)", x1) +print("X2 (before)", x2) + +x2.add_(x1) +print("X1 (after)", x1) +print("X2 (after)", x2) + +# %% [markdown] +# In-place operations are usually marked with a underscore postfix (for example `torch.add_` instead of `torch.add`). +# +# Another common operation aims at changing the shape of a tensor. +# A tensor of size (2,3) can be re-organized to any other shape with the same number of elements (e.g. a tensor of size (6), or (3,2), ...). +# In PyTorch, this operation is called `view`: + +# %% +x = torch.arange(6) +print("X", x) + +# %% +x = x.view(2, 3) +print("X", x) + +# %% +x = x.permute(1, 0) # Swapping dimension 0 and 1 +print("X", x) + +# %% [markdown] +# Other commonly used operations include matrix multiplications, which are essential for neural networks. +# Quite often, we have an input vector $\mathbf{x}$, which is transformed using a learned weight matrix $\mathbf{W}$. +# There are multiple ways and functions to perform matrix multiplication, some of which we list below: +# +# * `torch.matmul`: Performs the matrix product over two tensors, where the specific behavior depends on the dimensions. +# If both inputs are matrices (2-dimensional tensors), it performs the standard matrix product. +# For higher dimensional inputs, the function supports broadcasting (for details see the [documentation](https://pytorch.org/docs/stable/generated/torch.matmul.html?highlight=matmul#torch.matmul)). +# Can also be written as `a @ b`, similar to numpy. +# * `torch.mm`: Performs the matrix product over two matrices, but doesn't support broadcasting (see [documentation](https://pytorch.org/docs/stable/generated/torch.mm.html?highlight=torch%20mm#torch.mm)) +# * `torch.bmm`: Performs the matrix product with a support batch dimension. 
+# If the first tensor $T$ is of shape ($b\times n\times m$), and the second tensor $R$ ($b\times m\times p$), the output $O$ is of shape ($b\times n\times p$), and has been calculated by performing $b$ matrix multiplications of the submatrices of $T$ and $R$: $O_i = T_i @ R_i$ +# * `torch.einsum`: Performs matrix multiplications and more (i.e. sums of products) using the Einstein summation convention. +# Explanation of the Einstein sum can be found in assignment 1. +# +# Usually, we use `torch.matmul` or `torch.bmm`. We can try a matrix multiplication with `torch.matmul` below. + +# %% +x = torch.arange(6) +x = x.view(2, 3) +print("X", x) + +# %% +W = torch.arange(9).view(3, 3) # We can also stack multiple operations in a single line +print("W", W) + +# %% +h = torch.matmul(x, W) # Verify the result by calculating it by hand too! +print("h", h) + +# %% [markdown] +# #### Indexing +# +# We often have the situation where we need to select a part of a tensor. +# Indexing works just like in numpy, so let's try it: + +# %% +x = torch.arange(12).view(3, 4) +print("X", x) + +# %% +print(x[:, 1]) # Second column + +# %% +print(x[0]) # First row + +# %% +print(x[:2, -1]) # First two rows, last column + +# %% +print(x[1:3, :]) # Middle two rows + +# %% [markdown] +# ### Dynamic Computation Graph and Backpropagation +# +# One of the main reasons for using PyTorch in Deep Learning projects is that we can automatically get **gradients/derivatives** of functions that we define. +# We will mainly use PyTorch for implementing neural networks, and they are just fancy functions. +# If we use weight matrices in our function that we want to learn, then those are called the **parameters** or simply the **weights**. +# +# If our neural network would output a single scalar value, we would talk about taking the **derivative**, but you will see that quite often we will have **multiple** output variables ("values"); in that case we talk about **gradients**. +# It's a more general term. +# +# Given an input $\mathbf{x}$, we define our function by **manipulating** that input, usually by matrix-multiplications with weight matrices and additions with so-called bias vectors. +# As we manipulate our input, we are automatically creating a **computational graph**. +# This graph shows how to arrive at our output from our input. +# PyTorch is a **define-by-run** framework; this means that we can just do our manipulations, and PyTorch will keep track of that graph for us. +# Thus, we create a dynamic computation graph along the way. +# +# So, to recap: the only thing we have to do is to compute the **output**, and then we can ask PyTorch to automatically get the **gradients**. +# +# > **Note: Why do we want gradients? +# ** Consider that we have defined a function, a neural net, that is supposed to compute a certain output $y$ for an input vector $\mathbf{x}$. +# We then define an **error measure** that tells us how wrong our network is; how bad it is in predicting output $y$ from input $\mathbf{x}$. +# Based on this error measure, we can use the gradients to **update** the weights $\mathbf{W}$ that were responsible for the output, so that the next time we present input $\mathbf{x}$ to our network, the output will be closer to what we want. +# +# The first thing we have to do is to specify which tensors require gradients. +# By default, when we create a tensor, it does not require gradients. 
+ +# %% +x = torch.ones((3,)) +print(x.requires_grad) + +# %% [markdown] +# We can change this for an existing tensor using the function `requires_grad_()` (underscore indicating that this is a in-place operation). +# Alternatively, when creating a tensor, you can pass the argument +# `requires_grad=True` to most initializers we have seen above. + +# %% +x.requires_grad_(True) +print(x.requires_grad) + +# %% [markdown] +# In order to get familiar with the concept of a computation graph, we will create one for the following function: +# +# $$y = \frac{1}{|x|}\sum_i \left[(x_i + 2)^2 + 3\right]$$ +# +# You could imagine that $x$ are our parameters, and we want to optimize (either maximize or minimize) the output $y$. +# For this, we want to obtain the gradients $\partial y / \partial \mathbf{x}$. +# For our example, we'll use $\mathbf{x}=[0,1,2]$ as our input. + +# %% +x = torch.arange(3, dtype=torch.float32, requires_grad=True) # Only float tensors can have gradients +print("X", x) + +# %% [markdown] +# Now let's build the computation graph step by step. +# You can combine multiple operations in a single line, but we will +# separate them here to get a better understanding of how each operation +# is added to the computation graph. + +# %% +a = x + 2 +b = a**2 +c = b + 3 +y = c.mean() +print("Y", y) + +# %% [markdown] +# Using the statements above, we have created a computation graph that looks similar to the figure below: +# +#
+# +# We calculate $a$ based on the inputs $x$ and the constant $2$, $b$ is $a$ squared, and so on. +# The visualization is an abstraction of the dependencies between inputs and outputs of the operations we have applied. +# Each node of the computation graph has automatically defined a function for calculating the gradients with respect to its inputs, `grad_fn`. +# You can see this when we printed the output tensor $y$. +# This is why the computation graph is usually visualized in the reverse direction (arrows point from the result to the inputs). +# We can perform backpropagation on the computation graph by calling the +# function `backward()` on the last output, which effectively calculates +# the gradients for each tensor that has the property +# `requires_grad=True`: + +# %% +y.backward() + +# %% [markdown] +# `x.grad` will now contain the gradient $\partial y/ \partial \mathcal{x}$, and this gradient indicates how a change in $\mathbf{x}$ will affect output $y$ given the current input $\mathbf{x}=[0,1,2]$: + +# %% +print(x.grad) + +# %% [markdown] +# We can also verify these gradients by hand. +# We will calculate the gradients using the chain rule, in the same way as PyTorch did it: +# +# $$\frac{\partial y}{\partial x_i} = \frac{\partial y}{\partial c_i}\frac{\partial c_i}{\partial b_i}\frac{\partial b_i}{\partial a_i}\frac{\partial a_i}{\partial x_i}$$ +# +# Note that we have simplified this equation to index notation, and by using the fact that all operation besides the mean do not combine the elements in the tensor. +# The partial derivatives are: +# +# $$ +# \frac{\partial a_i}{\partial x_i} = 1,\hspace{1cm} +# \frac{\partial b_i}{\partial a_i} = 2\cdot a_i\hspace{1cm} +# \frac{\partial c_i}{\partial b_i} = 1\hspace{1cm} +# \frac{\partial y}{\partial c_i} = \frac{1}{3} +# $$ +# +# Hence, with the input being $\mathbf{x}=[0,1,2]$, our gradients are $\partial y/\partial \mathbf{x}=[4/3,2,8/3]$. +# The previous code cell should have printed the same result. + +# %% [markdown] +# ### GPU support +# +# A crucial feature of PyTorch is the support of GPUs, short for Graphics Processing Unit. +# A GPU can perform many thousands of small operations in parallel, making it very well suitable for performing large matrix operations in neural networks. +# When comparing GPUs to CPUs, we can list the following main differences (credit: [Kevin Krewell, 2009](https://blogs.nvidia.com/blog/2009/12/16/whats-the-difference-between-a-cpu-and-a-gpu/)) +# +#
+# +# CPUs and GPUs have both different advantages and disadvantages, which is why many computers contain both components and use them for different tasks. +# In case you are not familiar with GPUs, you can read up more details in this [NVIDIA blog post](https://blogs.nvidia.com/blog/2009/12/16/whats-the-difference-between-a-cpu-and-a-gpu/) or [here](https://www.intel.com/content/www/us/en/products/docs/processors/what-is-a-gpu.html). +# +# GPUs can accelerate the training of your network up to a factor of $100$ which is essential for large neural networks. +# PyTorch implements a lot of functionality for supporting GPUs (mostly those of NVIDIA due to the libraries [CUDA](https://developer.nvidia.com/cuda-zone) and [cuDNN](https://developer.nvidia.com/cudnn)). +# First, let's check whether you have a GPU available: + +# %% +gpu_avail = torch.cuda.is_available() +print(f"Is the GPU available? {gpu_avail}") + +# %% [markdown] +# If you have a GPU on your computer but the command above returns False, make sure you have the correct CUDA-version installed. +# The `dl2020` environment comes with the CUDA-toolkit 10.1, which is selected for the Lisa supercomputer. +# Please change it if necessary (CUDA 10.2 is currently common). +# On Google Colab, make sure that you have selected a GPU in your runtime setup (in the menu, check under `Runtime -> Change runtime type`). +# +# By default, all tensors you create are stored on the CPU. +# We can push a tensor to the GPU by using the function `.to(...)`, or `.cuda()`. +# However, it is often a good practice to define a `device` object in your code which points to the GPU if you have one, and otherwise to the CPU. +# Then, you can write your code with respect to this device object, and it allows you to run the same code on both a CPU-only system, and one with a GPU. +# Let's try it below. +# We can specify the device as follows: + +# %% +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +print("Device", device) + +# %% [markdown] +# Now let's create a tensor and push it to the device: + +# %% +x = torch.zeros(2, 3) +x = x.to(device) +print("X", x) + +# %% [markdown] +# In case you have a GPU, you should now see the attribute `device='cuda:0'` being printed next to your tensor. +# The zero next to cuda indicates that this is the zero-th GPU device on your computer. +# PyTorch also supports multi-GPU systems, but this you will only need once you have very big networks to train (if interested, see the [PyTorch documentation](https://pytorch.org/docs/stable/distributed.html#distributed-basics)). +# We can also compare the runtime of a large matrix multiplication on the CPU with a operation on the GPU: + +# %% +x = torch.randn(5000, 5000) + +# CPU version +start_time = time.time() +_ = torch.matmul(x, x) +end_time = time.time() +print(f"CPU time: {(end_time - start_time):6.5f}s") + +# GPU version +if torch.cuda.is_available(): + x = x.to(device) + # CUDA is asynchronous, so we need to use different timing functions + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + _ = torch.matmul(x, x) + end.record() + torch.cuda.synchronize() # Waits for everything to finish running on the GPU + print(f"GPU time: {0.001 * start.elapsed_time(end):6.5f}s") # Milliseconds to seconds + +# %% [markdown] +# Depending on the size of the operation and the CPU/GPU in your system, the speedup of this operation can be >50x. 
+
+# %% [markdown]
+# As `matmul` operations are very common in neural networks, we can already see the great benefit of training a NN on a GPU.
+# A single-run time estimate is relatively noisy; feel free to extend the averaging sketch above to more repetitions, though it also takes longer to run.
+#
+# When generating random numbers, the seed is not synchronized between CPU and GPU.
+# Hence, we need to set the seed on the GPU separately to ensure reproducible code.
+# Note that, due to different GPU architectures, running the same code on different GPUs does not guarantee the same random numbers.
+# Still, we don't want our code to give us a different output every time we run it on the exact same hardware.
+# Hence, we also set the seed on the GPU:
+
+# %%
+# GPU operations have a separate seed we also want to set
+if torch.cuda.is_available():
+    torch.cuda.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
+
+# Additionally, some operations on a GPU are implemented stochastically for efficiency
+# We want to ensure that all operations are deterministic on GPU (if used) for reproducibility
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+
+# %% [markdown]
+# ## Learning by example: Continuous XOR
+#
+# +# If we want to build a neural network in PyTorch, we could specify all our parameters (weight matrices, bias vectors) using `Tensors` (with `requires_grad=True`), ask PyTorch to calculate the gradients and then adjust the parameters. +# But things can quickly get cumbersome if we have a lot of parameters. +# In PyTorch, there is a package called `torch.nn` that makes building neural networks more convenient. +# +# We will introduce the libraries and all additional parts you might need to train a neural network in PyTorch, using a simple example classifier on a simple yet well known example: XOR. +# Given two binary inputs $x_1$ and $x_2$, the label to predict is $1$ if either $x_1$ or $x_2$ is $1$ while the other is $0$, or the label is $0$ in all other cases. +# The example became famous by the fact that a single neuron, i.e. a linear classifier, cannot learn this simple function. +# Hence, we will learn how to build a small neural network that can learn this function. +# To make it a little bit more interesting, we move the XOR into continuous space and introduce some gaussian noise on the binary inputs. +# Our desired separation of an XOR dataset could look as follows: +# +#
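+#
+# As a small reference (a minimal sketch we add for illustration; the variable names are our own), the noise-free
+# XOR labels for the four binary input combinations follow directly from the rule described above:
+
+# %%
+binary_inputs = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]])
+# The label is 1 exactly if one of the two inputs is 1 and the other is 0
+binary_labels = (binary_inputs.sum(dim=1) == 1).long()
+print("Inputs:\n", binary_inputs)
+print("XOR labels:", binary_labels)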
+
+# %% [markdown]
+# ### The model
+#
+# The package `torch.nn` defines a series of useful classes like linear network layers, activation functions, loss functions, etc.
+# A full list can be found [here](https://pytorch.org/docs/stable/nn.html).
+# In case you need a certain network layer, check the documentation of the package first before writing the layer yourself, as the package likely contains the code for it already.
+# We import it below:
+
+# %%
+import torch.nn as nn
+import torch.nn.functional as F
+
+# %% [markdown]
+# In addition to `torch.nn`, there is also `torch.nn.functional`.
+# It contains functions that are used in network layers.
+# This is in contrast to `torch.nn`, which defines them as `nn.Modules` (more on it below), and `torch.nn` actually uses a lot of functionalities from `torch.nn.functional`.
+# Hence, the functional package is useful in many situations, and so we import it as well here.
+
+# %% [markdown]
+# #### nn.Module
+#
+# In PyTorch, a neural network is built up out of modules.
+# Modules can contain other modules, and a neural network is considered to be a module itself as well.
+# The basic template of a module is as follows:
+
+
+# %%
+class MyModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # Some init for my module
+
+    def forward(self, x):
+        # Function for performing the calculation of the module.
+        pass
+
+
+# %% [markdown]
+# The forward function is where the computation of the module takes place, and it is executed when you call the module (`module = MyModule(); module(x)`).
+# In the init function, we usually create the parameters of the module, using `nn.Parameter`, or define other modules that are used in the forward function.
+# The backward calculation is done automatically, but it can be overridden as well if needed.
+#
+# #### Simple classifier
+# We can now make use of the pre-defined modules in the `torch.nn` package, and define our own small neural network.
+# We will use a minimal network with an input layer, one hidden layer with tanh as the activation function, and an output layer.
+# In other words, our network should look something like this:
+#
+#
+# +# The input neurons are shown in blue, which represent the coordinates $x_1$ and $x_2$ of a data point. +# The hidden neurons including a tanh activation are shown in white, and the output neuron in red. +# In PyTorch, we can define this as follows: + + +# %% +class SimpleClassifier(nn.Module): + def __init__(self, num_inputs, num_hidden, num_outputs): + super().__init__() + # Initialize the modules we need to build the network + self.linear1 = nn.Linear(num_inputs, num_hidden) + self.act_fn = nn.Tanh() + self.linear2 = nn.Linear(num_hidden, num_outputs) + + def forward(self, x): + # Perform the calculation of the model to determine the prediction + x = self.linear1(x) + x = self.act_fn(x) + x = self.linear2(x) + return x + + +# %% [markdown] +# For the examples in this notebook, we will use a tiny neural network with two input neurons and four hidden neurons. +# As we perform binary classification, we will use a single output neuron. +# Note that we do not apply a sigmoid on the output yet. +# This is because other functions, especially the loss, are more efficient and precise to calculate on the original outputs instead of the sigmoid output. +# We will discuss the detailed reason later. + +# %% +model = SimpleClassifier(num_inputs=2, num_hidden=4, num_outputs=1) +# Printing a module shows all its submodules +print(model) + +# %% [markdown] +# Printing the model lists all submodules it contains. +# The parameters of a module can be obtained by using its `parameters()` functions, or `named_parameters()` to get a name to each parameter object. +# For our small neural network, we have the following parameters: + +# %% +for name, param in model.named_parameters(): + print(f"Parameter {name}, shape {param.shape}") + +# %% [markdown] +# Each linear layer has a weight matrix of the shape `[output, input]`, and a bias of the shape `[output]`. +# The tanh activation function does not have any parameters. +# Note that parameters are only registered for `nn.Module` objects that are direct object attributes, i.e. `self.a = ...`. +# If you define a list of modules, the parameters of those are not registered for the outer module and can cause some issues when you try to optimize your module. +# There are alternatives, like `nn.ModuleList`, `nn.ModuleDict` and `nn.Sequential`, that allow you to have different data structures of modules. +# We will use them in a few later tutorials and explain them there. + +# %% [markdown] +# ### The data +# +# PyTorch also provides a few functionalities to load the training and +# test data efficiently, summarized in the package `torch.utils.data`. + +# %% + +# %% [markdown] +# The data package defines two classes which are the standard interface for handling data in PyTorch: `data.Dataset`, and `data.DataLoader`. +# The dataset class provides an uniform interface to access the +# training/test data, while the data loader makes sure to efficiently load +# and stack the data points from the dataset into batches during training. + +# %% [markdown] +# #### The dataset class +# +# The dataset class summarizes the basic functionality of a dataset in a natural way. +# To define a dataset in PyTorch, we simply specify two functions: `__getitem__`, and `__len__`. +# The get-item function has to return the $i$-th data point in the dataset, while the len function returns the size of the dataset. +# For the XOR dataset, we can define the dataset class as follows: + +# %% + + +class XORDataset(data.Dataset): + def __init__(self, size, std=0.1): + """XORDataset. 
+ + Args: + size: Number of data points we want to generate + std: Standard deviation of the noise (see generate_continuous_xor function) + """ + super().__init__() + self.size = size + self.std = std + self.generate_continuous_xor() + + def generate_continuous_xor(self): + # Each data point in the XOR dataset has two variables, x and y, that can be either 0 or 1 + # The label is their XOR combination, i.e. 1 if only x or only y is 1 while the other is 0. + # If x=y, the label is 0. + data = torch.randint(low=0, high=2, size=(self.size, 2), dtype=torch.float32) + label = (data.sum(dim=1) == 1).to(torch.long) + # To make it slightly more challenging, we add a bit of gaussian noise to the data points. + data += self.std * torch.randn(data.shape) + + self.data = data + self.label = label + + def __len__(self): + # Number of data point we have. Alternatively self.data.shape[0], or self.label.shape[0] + return self.size + + def __getitem__(self, idx): + # Return the idx-th data point of the dataset + # If we have multiple things to return (data point and label), we can return them as tuple + data_point = self.data[idx] + data_label = self.label[idx] + return data_point, data_label + + +# %% [markdown] +# Let's try to create such a dataset and inspect it: + +# %% +dataset = XORDataset(size=200) +print("Size of dataset:", len(dataset)) +print("Data point 0:", dataset[0]) + +# %% [markdown] +# To better relate to the dataset, we visualize the samples below. + + +# %% +def visualize_samples(data, label): + if isinstance(data, Tensor): + data = data.cpu().numpy() + if isinstance(label, Tensor): + label = label.cpu().numpy() + data_0 = data[label == 0] + data_1 = data[label == 1] + + plt.figure(figsize=(4, 4)) + plt.scatter(data_0[:, 0], data_0[:, 1], edgecolor="#333", label="Class 0") + plt.scatter(data_1[:, 0], data_1[:, 1], edgecolor="#333", label="Class 1") + plt.title("Dataset samples") + plt.ylabel(r"$x_2$") + plt.xlabel(r"$x_1$") + plt.legend() + + +# %% +visualize_samples(dataset.data, dataset.label) +plt.show() + +# %% [markdown] +# #### The data loader class +# +# The class `torch.utils.data.DataLoader` represents a Python iterable over a dataset with support for automatic batching, multi-process data loading and many more features. +# The data loader communicates with the dataset using the function `__getitem__`, and stacks its outputs as tensors over the first dimension to form a batch. +# In contrast to the dataset class, we usually don't have to define our own data loader class, but can just create an object of it with the dataset as input. +# Additionally, we can configure our data loader with the following input arguments (only a selection, see full list [here](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)): +# +# * `batch_size`: Number of samples to stack per batch +# * `shuffle`: If True, the data is returned in a random order. +# This is important during training for introducing stochasticity. +# * `num_workers`: Number of subprocesses to use for data loading. +# The default, 0, means that the data will be loaded in the main process which can slow down training for datasets where loading a data point takes a considerable amount of time (e.g. large images). +# More workers are recommended for those, but can cause issues on Windows computers. +# For tiny datasets as ours, 0 workers are usually faster. +# * `pin_memory`: If True, the data loader will copy Tensors into CUDA pinned memory before returning them. 
+# This can save some time for large data points on GPUs.
+# It is usually good practice to use it for the training set, but not necessarily for validation and test, to save memory on the GPU.
+# * `drop_last`: If True, the last batch is dropped in case it is smaller than the specified batch size.
+# This occurs when the dataset size is not a multiple of the batch size.
+# It is only potentially helpful during training to keep a consistent batch size.
+#
+# Let's create a simple data loader below:
+
+# %%
+data_loader = data.DataLoader(dataset, batch_size=8, shuffle=True)
+
+# %%
+# next(iter(...)) fetches the first batch of the data loader
+# If shuffle is True, this will return a different batch every time we run this cell
+# For iterating over the whole dataset, we can simply use "for batch in data_loader: ..."
+data_inputs, data_labels = next(iter(data_loader))
+
+# The shape of the outputs is [batch_size, d_1,...,d_N] where d_1,...,d_N are the
+# dimensions of the data point returned from the dataset class
+print("Data inputs", data_inputs.shape, "\n", data_inputs)
+print("Data labels", data_labels.shape, "\n", data_labels)
+
+# %% [markdown]
+# ### Optimization
+#
+# After defining the model and the dataset, it is time to prepare the optimization of the model.
+# During training, we will perform the following steps:
+#
+# 1. Get a batch from the data loader
+# 2. Obtain the predictions from the model for the batch
+# 3. Calculate the loss based on the difference between predictions and labels
+# 4. Backpropagation: calculate the gradients of the loss with respect to every model parameter
+# 5. Update the parameters of the model in the opposite direction of the gradients (to minimize the loss)
+#
+# We have seen how we can do steps 1, 2 and 4 in PyTorch. Now, we will look at steps 3 and 5.
+
+# %% [markdown]
+# #### Loss modules
+#
+# We can calculate the loss for a batch by simply performing a few tensor operations, as those are automatically added to the computation graph.
+# For instance, for binary classification, we can use Binary Cross Entropy (BCE), which is defined as follows:
+#
+# $$\mathcal{L}_{BCE} = -\sum_i \left[ y_i \log x_i + (1 - y_i) \log (1 - x_i) \right]$$
+#
+# where $y$ are our labels, and $x$ our predictions, both in the range $[0,1]$.
+# However, PyTorch already provides a list of predefined loss functions which we can use (see [here](https://pytorch.org/docs/stable/nn.html#loss-functions) for a full list).
+# For instance, for BCE, PyTorch has two modules: `nn.BCELoss()` and `nn.BCEWithLogitsLoss()`.
+# While `nn.BCELoss` expects the inputs $x$ to be in the range $[0,1]$, i.e. the output of a sigmoid, `nn.BCEWithLogitsLoss` combines a sigmoid layer and the BCE loss in a single class.
+# This version is numerically more stable than using a plain sigmoid followed by a BCE loss because of the logarithms applied in the loss function.
+# Hence, it is advised to use loss functions applied on "logits" where possible (remember not to apply a sigmoid on the output of the model in this case!).
+# For our model defined above, we therefore use the module `nn.BCEWithLogitsLoss`.
+
+# %%
+loss_module = nn.BCEWithLogitsLoss()
+
+# %% [markdown]
+# #### Stochastic Gradient Descent
+#
+# For updating the parameters, PyTorch provides the package `torch.optim` that has most popular optimizers implemented.
+# We will discuss the specific optimizers and their differences later in the course, but will for now use the simplest of them: `torch.optim.SGD`.
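+# Written as an update rule (a small sketch for reference; $\theta$ denotes a single parameter, $\eta$ the learning rate, and $\mathcal{L}$ the loss, notation we introduce here):
+#
+# $$\theta \leftarrow \theta - \eta \cdot \frac{\partial \mathcal{L}}{\partial \theta}$$
+#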
+# Stochastic Gradient Descent updates parameters by multiplying the gradients with a small constant, called learning rate, and subtracting those from the parameters (hence minimizing the loss). +# Therefore, we slowly move towards the direction of minimizing the loss. +# A good default value of the learning rate for a small network as ours is 0.1. + +# %% +# Input to the optimizer are the parameters of the model: model.parameters() +optimizer = torch.optim.SGD(model.parameters(), lr=0.1) + +# %% [markdown] +# The optimizer provides two useful functions: `optimizer.step()`, and `optimizer.zero_grad()`. +# The step function updates the parameters based on the gradients as explained above. +# The function `optimizer.zero_grad()` sets the gradients of all parameters to zero. +# While this function seems less relevant at first, it is a crucial pre-step before performing backpropagation. +# If we would call the `backward` function on the loss while the parameter gradients are non-zero from the previous batch, the new gradients would actually be added to the previous ones instead of overwriting them. +# This is done because a parameter might occur multiple times in a computation graph, and we need to sum the gradients in this case instead of replacing them. +# Hence, remember to call `optimizer.zero_grad()` before calculating the gradients of a batch. + +# %% [markdown] +# ### Training +# +# Finally, we are ready to train our model. +# As a first step, we create a slightly larger dataset and specify a data loader with a larger batch size. + +# %% +train_dataset = XORDataset(size=1000) +train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True) + +# %% [markdown] +# Now, we can write a small training function. +# Remember our five steps: load a batch, obtain the predictions, calculate the loss, backpropagate, and update. +# Additionally, we have to push all data and model parameters to the device of our choice (GPU if available). +# For the tiny neural network we have, communicating the data to the GPU actually takes much more time than we could save from running the operation on GPU. +# For large networks, the communication time is significantly smaller than the actual runtime making a GPU crucial in these cases. +# Still, to practice, we will push the data to GPU here. + +# %% +# Push model to device. Has to be only done once +model.to(device) + +# %% [markdown] +# In addition, we set our model to training mode. +# This is done by calling `model.train()`. +# There exist certain modules that need to perform a different forward +# step during training than during testing (e.g. BatchNorm and Dropout), +# and we can switch between them using `model.train()` and `model.eval()`. + + +# %% +def train_model(model, optimizer, data_loader, loss_module, num_epochs=100): + # Set model to train mode + model.train() + + # Training loop + for epoch in tqdm(range(num_epochs)): + for data_inputs, data_labels in data_loader: + # Step 1: Move input data to device (only strictly necessary if we use GPU) + data_inputs = data_inputs.to(device) + data_labels = data_labels.to(device) + + # Step 2: Run the model on the input data + preds = model(data_inputs) + preds = preds.squeeze(dim=1) # Output is [Batch size, 1], but we want [Batch size] + + # Step 3: Calculate the loss + loss = loss_module(preds, data_labels.float()) + + # Step 4: Perform backpropagation + # Before calculating the gradients, we need to ensure that they are all zero. 
+            # Otherwise, the gradients would not be overwritten, but actually added to the existing ones.
+            optimizer.zero_grad()
+            # Perform backpropagation
+            loss.backward()
+
+            # Step 5: Update the parameters
+            optimizer.step()
+
+
+# %%
+train_model(model, optimizer, train_data_loader, loss_module)
+
+# %% [markdown]
+# #### Saving a model
+#
+# After we finish training a model, we save it to disk so that we can load the same weights at a later time.
+# For this, we extract the so-called `state_dict` from the model, which contains all learnable parameters.
+# For our simple model, the state dict contains the following entries:
+
+# %%
+state_dict = model.state_dict()
+print(state_dict)
+
+# %% [markdown]
+# To save the state dictionary, we can use `torch.save`:
+
+# %%
+# torch.save(object, filename). For the filename, any extension can be used
+torch.save(state_dict, "our_model.tar")
+
+# %% [markdown]
+# To load a model from a state dict, we use the function `torch.load` to
+# load the state dict from the disk, and the module function
+# `load_state_dict` to overwrite our parameters with the new values:
+
+# %%
+# Load state dict from the disk (make sure it is the same name as above)
+state_dict = torch.load("our_model.tar")
+
+# Create a new model and load the state
+new_model = SimpleClassifier(num_inputs=2, num_hidden=4, num_outputs=1)
+new_model.load_state_dict(state_dict)
+
+# Verify that the parameters are the same
+print("Original model\n", model.state_dict())
+print("\nLoaded model\n", new_model.state_dict())
+
+# %% [markdown]
+# A detailed tutorial on saving and loading models in PyTorch can be found
+# [here](https://pytorch.org/tutorials/beginner/saving_loading_models.html).
+
+# %% [markdown]
+# ### Evaluation
+#
+# Once we have trained a model, it is time to evaluate it on a held-out test set.
+# As our dataset consists of randomly generated data points, we need to
+# first create a test set with a corresponding data loader.
+
+# %%
+test_dataset = XORDataset(size=500)
+# drop_last -> Don't drop the last batch although it is smaller than 128
+test_data_loader = data.DataLoader(test_dataset, batch_size=128, shuffle=False, drop_last=False)
+
+# %% [markdown]
+# As our metric, we will use accuracy, which is calculated as follows:
+#
+# $$acc = \frac{\#\text{correct predictions}}{\#\text{all predictions}} = \frac{TP+TN}{TP+TN+FP+FN}$$
+#
+# where TP are the true positives, TN the true negatives, FP the false positives, and FN the false negatives.
+#
+# When evaluating the model, we don't need to keep track of the computation graph as we don't intend to calculate the gradients.
+# This reduces the required memory and speeds up the model.
+# In PyTorch, we can deactivate the computation graph using `with torch.no_grad(): ...`.
+# Remember to additionally set the model to eval mode.
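+
+# %% [markdown]
+# As a quick sanity check (a minimal sketch we add for illustration; the variable names below are our own and not
+# part of the original tutorial flow), we can confirm that no computation graph is recorded under `torch.no_grad()`:
+
+# %%
+example_inputs, _ = next(iter(test_data_loader))
+with torch.no_grad():
+    example_preds = model(example_inputs.to(device))
+# requires_grad is False on the output, so no graph was built and backward() would not be possible here
+print("Predictions require grad inside no_grad():", example_preds.requires_grad)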
+ + +# %% +def eval_model(model, data_loader): + model.eval() # Set model to eval mode + true_preds, num_preds = 0.0, 0.0 + + with torch.no_grad(): # Deactivate gradients for the following code + for data_inputs, data_labels in data_loader: + # Determine prediction of model on dev set + data_inputs, data_labels = data_inputs.to(device), data_labels.to(device) + preds = model(data_inputs) + preds = preds.squeeze(dim=1) + preds = torch.sigmoid(preds) # Sigmoid to map predictions between 0 and 1 + pred_labels = (preds >= 0.5).long() # Binarize predictions to 0 and 1 + + # Keep records of predictions for the accuracy metric (true_preds=TP+TN, num_preds=TP+TN+FP+FN) + true_preds += (pred_labels == data_labels).sum() + num_preds += data_labels.shape[0] + + acc = true_preds / num_preds + print(f"Accuracy of the model: {100.0*acc:4.2f}%") + + +# %% +eval_model(model, test_data_loader) + +# %% [markdown] +# If we trained our model correctly, we should see a score close to 100% accuracy. +# However, this is only possible because of our simple task, and +# unfortunately, we usually don't get such high scores on test sets of +# more complex tasks. + +# %% [markdown] +# #### Visualizing classification boundaries +# +# To visualize what our model has learned, we can perform a prediction for every data point in a range of $[-0.5, 1.5]$, and visualize the predicted class as in the sample figure at the beginning of this section. +# This shows where the model has created decision boundaries, and which points would be classified as $0$, and which as $1$. +# We therefore get a background image out of blue (class 0) and orange (class 1). +# The spots where the model is uncertain we will see a blurry overlap. +# The specific code is less relevant compared to the output figure which +# should hopefully show us a clear separation of classes: + + +# %% +@torch.no_grad() # Decorator, same effect as "with torch.no_grad(): ..." over the whole function. +def visualize_classification(model, data, label): + if isinstance(data, Tensor): + data = data.cpu().numpy() + if isinstance(label, Tensor): + label = label.cpu().numpy() + data_0 = data[label == 0] + data_1 = data[label == 1] + + plt.figure(figsize=(4, 4)) + plt.scatter(data_0[:, 0], data_0[:, 1], edgecolor="#333", label="Class 0") + plt.scatter(data_1[:, 0], data_1[:, 1], edgecolor="#333", label="Class 1") + plt.title("Dataset samples") + plt.ylabel(r"$x_2$") + plt.xlabel(r"$x_1$") + plt.legend() + + # Let's make use of a lot of operations we have learned above + model.to(device) + c0 = Tensor(to_rgba("C0")).to(device) + c1 = Tensor(to_rgba("C1")).to(device) + x1 = torch.arange(-0.5, 1.5, step=0.01, device=device) + x2 = torch.arange(-0.5, 1.5, step=0.01, device=device) + xx1, xx2 = torch.meshgrid(x1, x2) # Meshgrid function as in numpy + model_inputs = torch.stack([xx1, xx2], dim=-1) + preds = model(model_inputs) + preds = torch.sigmoid(preds) + # Specifying "None" in a dimension creates a new one + output_image = (1 - preds) * c0[None, None] + preds * c1[None, None] + output_image = ( + output_image.cpu().numpy() + ) # Convert to numpy array. This only works for tensors on CPU, hence first push to CPU + plt.imshow(output_image, origin="lower", extent=(-0.5, 1.5, -0.5, 1.5)) + plt.grid(False) + + +visualize_classification(model, dataset.data, dataset.label) +plt.show() + +# %% [markdown] +# The decision boundaries might not look exactly as in the figure in the preamble of this section which can be caused by running it on CPU or a different GPU architecture. 
+# Nevertheless, the result on the accuracy metric should be the approximately the same. + +# %% [markdown] +# ## Additional features we didn't get to discuss yet +# +# Finally, you are all set to start with your own PyTorch project! +# In summary, we have looked at how we can build neural networks in PyTorch, and train and test them on data. +# However, there is still much more to PyTorch we haven't discussed yet. +# In the comming series of Jupyter notebooks, we will discover more and more functionalities of PyTorch, so that you also get familiar to PyTorch concepts beyond the basics. +# If you are already interested in learning more of PyTorch, we recommend the official [tutorial website](https://pytorch.org/tutorials/) that contains many tutorials on various topics. +# Especially logging with Tensorboard ([tutorial +# here](https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html)) +# is a good practice that we will explore from Tutorial 5 on. diff --git a/course_UvA-DL/01-introduction-to-pytorch/comparison_CPU_GPU.png b/course_UvA-DL/01-introduction-to-pytorch/comparison_CPU_GPU.png new file mode 100644 index 0000000..b7d94f9 Binary files /dev/null and b/course_UvA-DL/01-introduction-to-pytorch/comparison_CPU_GPU.png differ diff --git a/course_UvA-DL/01-introduction-to-pytorch/continuous_xor.svg b/course_UvA-DL/01-introduction-to-pytorch/continuous_xor.svg new file mode 100644 index 0000000..12bfd7f --- /dev/null +++ b/course_UvA-DL/01-introduction-to-pytorch/continuous_xor.svg @@ -0,0 +1 @@ + diff --git a/course_UvA-DL/01-introduction-to-pytorch/pytorch_computation_graph.svg b/course_UvA-DL/01-introduction-to-pytorch/pytorch_computation_graph.svg new file mode 100644 index 0000000..912788a --- /dev/null +++ b/course_UvA-DL/01-introduction-to-pytorch/pytorch_computation_graph.svg @@ -0,0 +1 @@ +x2abc3y diff --git a/course_UvA-DL/01-introduction-to-pytorch/small_neural_network.svg b/course_UvA-DL/01-introduction-to-pytorch/small_neural_network.svg new file mode 100644 index 0000000..4a55eac --- /dev/null +++ b/course_UvA-DL/01-introduction-to-pytorch/small_neural_network.svg @@ -0,0 +1 @@ +x1x2 diff --git a/course_UvA-DL/02-activation-functions/.meta.yml b/course_UvA-DL/02-activation-functions/.meta.yml new file mode 100644 index 0000000..8d6392a --- /dev/null +++ b/course_UvA-DL/02-activation-functions/.meta.yml @@ -0,0 +1,20 @@ +title: "Tutorial 2: Activation Functions" +author: Phillip Lippe +created: 2021-08-27 +updated: 2023-03-14 +license: CC BY-SA +description: | + In this tutorial, we will take a closer look at (popular) activation functions and investigate their effect on optimization properties in neural networks. + Activation functions are a crucial part of deep learning models as they add the non-linearity to neural networks. + There is a great variety of activation functions in the literature, and some are more beneficial than others. + The goal of this tutorial is to show the importance of choosing a good activation function (and how to do so), and what problems might occur if we don't. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. 
+requirements: + - torchvision + - matplotlib + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/course_UvA-DL/02-activation-functions/.thumb.jpg b/course_UvA-DL/02-activation-functions/.thumb.jpg new file mode 100644 index 0000000..1b21f50 Binary files /dev/null and b/course_UvA-DL/02-activation-functions/.thumb.jpg differ diff --git a/course_UvA-DL/02-activation-functions/Activation_Functions.py b/course_UvA-DL/02-activation-functions/Activation_Functions.py new file mode 100644 index 0000000..a6dd6d1 --- /dev/null +++ b/course_UvA-DL/02-activation-functions/Activation_Functions.py @@ -0,0 +1,805 @@ +# %% [markdown] +#
+# Before we start, we import our standard libraries and set up basic functions: + +# %% +import json +import math +import os +import urllib.request +import warnings +from urllib.error import HTTPError + +import matplotlib.pyplot as plt + +# %matplotlib inline +import matplotlib_inline.backend_inline +import numpy as np +import seaborn as sns +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data +import torchvision +from torchvision import transforms +from torchvision.datasets import FashionMNIST +from tqdm.notebook import tqdm + +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +sns.set() + +# %% [markdown] +# We will define a function to set a seed on all libraries we might interact with in this tutorial (here numpy and torch). +# This allows us to make our training reproducible. +# However, note that in contrast to the CPU, the same seed on different GPU architectures can give different results. +# All models here have been trained on an NVIDIA GTX1080Ti. +# +# Additionally, the following cell defines two paths: `DATASET_PATH` and `CHECKPOINT_PATH`. +# The dataset path is the directory where we will download datasets used in the notebooks. +# It is recommended to store all datasets from PyTorch in one joined directory to prevent duplicate downloads. +# The checkpoint path is the directory where we will store trained model weights and additional files. +# The needed files will be automatically downloaded. +# In case you are on Google Colab, it is recommended to change the +# directories to start from the current directory (i.e. remove `../` for +# both dataset and checkpoint path). + +# %% +# Path to the folder where the datasets are/should be downloaded (e.g. MNIST) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/Activation_Functions/") + + +# Function for setting the seed +def set_seed(seed): + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): # GPU operation have separate seed + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +set_seed(42) + +# Additionally, some operations on a GPU are implemented stochastic for efficiency +# We want to ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# Fetching the device that will be used throughout this notebook +device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0") +print("Using device", device) + +# %% [markdown] +# The following cell downloads all pretrained models we will use in this notebook. +# The files are stored on a separate [repository](https://github.com/phlippe/saved_models) to reduce the size of the notebook repository, especially for building the documentation on ReadTheDocs. +# In case the download below fails, you can download the models from a [Google Drive folder](https://drive.google.com/drive/folders/1sFpZUpDJVjiYEvIqISqfkFizfsTnPf4s?usp=sharing). +# Please let me (Phillip) know if an error occurs so it can be fixed for all students. 
+ +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial3/" +# Files to download +pretrained_files = [ + "FashionMNIST_elu.config", + "FashionMNIST_elu.tar", + "FashionMNIST_leakyrelu.config", + "FashionMNIST_leakyrelu.tar", + "FashionMNIST_relu.config", + "FashionMNIST_relu.tar", + "FashionMNIST_sigmoid.config", + "FashionMNIST_sigmoid.tar", + "FashionMNIST_swish.config", + "FashionMNIST_swish.tar", + "FashionMNIST_tanh.config", + "FashionMNIST_tanh.tar", +] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print(f"Downloading {file_url}...") + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# ## Common activation functions + +# %% [markdown] +# As a first step, we will implement some common activation functions by ourselves. +# Of course, most of them can also be found in the `torch.nn` package (see the [documentation](https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity) for an overview). +# However, we'll write our own functions here for a better understanding and insights. +# +# For an easier time of comparing various activation functions, we start +# with defining a base class from which all our future modules will +# inherit: + + +# %% +class ActivationFunction(nn.Module): + def __init__(self): + super().__init__() + self.name = self.__class__.__name__ + self.config = {"name": self.name} + + +# %% [markdown] +# Every activation function will be an `nn.Module` so that we can integrate them nicely in a network. +# We will use the `config` dictionary to store adjustable parameters for some activation functions. +# +# Next, we implement two of the "oldest" activation functions that are still commonly used for various tasks: sigmoid and tanh. +# Both the sigmoid and tanh activation can be also found as PyTorch functions (`torch.sigmoid`, `torch.tanh`) or as modules (`nn.Sigmoid`, `nn.Tanh`). +# Here, we implement them by hand: + + +# %% +class Sigmoid(ActivationFunction): + def forward(self, x): + return 1 / (1 + torch.exp(-x)) + + +class Tanh(ActivationFunction): + def forward(self, x): + x_exp, neg_x_exp = torch.exp(x), torch.exp(-x) + return (x_exp - neg_x_exp) / (x_exp + neg_x_exp) + + +# %% [markdown] +# Another popular activation function that has allowed the training of deeper networks, is the Rectified Linear Unit (ReLU). +# Despite its simplicity of being a piecewise linear function, ReLU has one major benefit compared to sigmoid and tanh: a strong, stable gradient for a large range of values. +# Based on this idea, a lot of variations of ReLU have been proposed, of which we will implement the following three: LeakyReLU, ELU, and Swish. +# LeakyReLU replaces the zero settings in the negative part with a smaller slope to allow gradients to flow also in this part of the input. +# Similarly, ELU replaces the negative part with an exponential decay. 
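+# In formulas (a compact sketch matching the implementations further below):
+#
+# $$\text{LeakyReLU}(x)=\begin{cases}x & x>0\\ \alpha\cdot x & x\leq 0\end{cases},\hspace{1cm}
+# \text{ELU}(x)=\begin{cases}x & x>0\\ \exp(x)-1 & x\leq 0\end{cases}$$
+#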
+# The third, most recently proposed activation function is Swish, which is actually the result of a large experiment with the purpose of finding the "optimal" activation function. +# Compared to the other activation functions, Swish is both smooth and non-monotonic (i.e. contains a change of sign in the gradient). +# This has been shown to prevent dead neurons as in standard ReLU activation, especially for deep networks. +# If interested, a more detailed discussion of the benefits of Swish can be found in [this paper](https://arxiv.org/abs/1710.05941) [1]. +# +# Let's implement the four activation functions below: + + +# %% +class ReLU(ActivationFunction): + def forward(self, x): + return x * (x > 0).float() + + +class LeakyReLU(ActivationFunction): + def __init__(self, alpha=0.1): + super().__init__() + self.config["alpha"] = alpha + + def forward(self, x): + return torch.where(x > 0, x, self.config["alpha"] * x) + + +class ELU(ActivationFunction): + def forward(self, x): + return torch.where(x > 0, x, torch.exp(x) - 1) + + +class Swish(ActivationFunction): + def forward(self, x): + return x * torch.sigmoid(x) + + +# %% [markdown] +# For later usage, we summarize all our activation functions in a dictionary mapping the name to the class object. +# In case you implement a new activation function by yourself, add it here to include it in future comparisons as well: + +# %% +act_fn_by_name = {"sigmoid": Sigmoid, "tanh": Tanh, "relu": ReLU, "leakyrelu": LeakyReLU, "elu": ELU, "swish": Swish} + +# %% [markdown] +# ### Visualizing activation functions +# +# To get an idea of what each activation function actually does, we will visualize them in the following. +# Next to the actual activation value, the gradient of the function is an important aspect as it is crucial for optimizing the neural network. +# PyTorch allows us to compute the gradients simply by calling the `backward` function: + + +# %% +def get_grads(act_fn, x): + """Computes the gradients of an activation function at specified positions. + + Args: + act_fn: An object of the class "ActivationFunction" with an implemented forward pass. + x: 1D input tensor. + + Returns: + A tensor with the same size of x containing the gradients of act_fn at x. 
+ """ + x = x.clone().requires_grad_() # Mark the input as tensor for which we want to store gradients + out = act_fn(x) + out.sum().backward() # Summing results in an equal gradient flow to each element in x + return x.grad # Accessing the gradients of x by "x.grad" + + +# %% [markdown] +# Now we can visualize all our activation functions including their gradients: + + +# %% +def vis_act_fn(act_fn, ax, x): + # Run activation function + y = act_fn(x) + y_grads = get_grads(act_fn, x) + # Push x, y and gradients back to cpu for plotting + x, y, y_grads = x.cpu().numpy(), y.cpu().numpy(), y_grads.cpu().numpy() + # Plotting + ax.plot(x, y, linewidth=2, label="ActFn") + ax.plot(x, y_grads, linewidth=2, label="Gradient") + ax.set_title(act_fn.name) + ax.legend() + ax.set_ylim(-1.5, x.max()) + + +# Add activation functions if wanted +act_fns = [act_fn() for act_fn in act_fn_by_name.values()] +x = torch.linspace(-5, 5, 1000) # Range on which we want to visualize the activation functions +# Plotting +cols = 2 +rows = math.ceil(len(act_fns) / float(cols)) +fig, ax = plt.subplots(rows, cols, figsize=(cols * 4, rows * 4)) +for i, act_fn in enumerate(act_fns): + vis_act_fn(act_fn, ax[divmod(i, cols)], x) +fig.subplots_adjust(hspace=0.3) +plt.show() + +# %% [markdown] +# ## Analysing the effect of activation functions +#
+ + +# %% [markdown] +# After implementing and visualizing the activation functions, we are aiming to gain insights into their effect. +# We do this by using a simple neural network trained on +# [FashionMNIST](https://github.com/zalandoresearch/fashion-mnist) and +# examine various aspects of the model, including the performance and +# gradient flow. + +# %% [markdown] +# ### Setup + +# %% [markdown] +# Firstly, let's set up a neural network. +# The chosen network views the images as 1D tensors and pushes them through a sequence of linear layers and a specified activation function. +# Feel free to experiment with other network architectures. + + +# %% +class BaseNetwork(nn.Module): + def __init__(self, act_fn, input_size=784, num_classes=10, hidden_sizes=[512, 256, 256, 128]): + """Base Network. + + Args: + act_fn: Object of the activation function that should be used as non-linearity in the network. + input_size: Size of the input images in pixels + num_classes: Number of classes we want to predict + hidden_sizes: A list of integers specifying the hidden layer sizes in the NN + """ + super().__init__() + + # Create the network based on the specified hidden sizes + layers = [] + layer_sizes = [input_size] + hidden_sizes + layer_size_last = layer_sizes[0] + for layer_size in layer_sizes[1:]: + layers += [nn.Linear(layer_size_last, layer_size), act_fn] + layer_size_last = layer_size + layers += [nn.Linear(layer_sizes[-1], num_classes)] + # nn.Sequential summarizes a list of modules into a single module, applying them in sequence + self.layers = nn.Sequential(*layers) + + # We store all hyperparameters in a dictionary for saving and loading of the model + self.config = { + "act_fn": act_fn.config, + "input_size": input_size, + "num_classes": num_classes, + "hidden_sizes": hidden_sizes, + } + + def forward(self, x): + x = x.view(x.size(0), -1) # Reshape images to a flat vector + out = self.layers(x) + return out + + +# %% [markdown] +# We also add functions for loading and saving the model. +# The hyperparameters are stored in a configuration file (simple json file): + + +# %% +def _get_config_file(model_path, model_name): + # Name of the file for storing hyperparameter details + return os.path.join(model_path, model_name + ".config") + + +def _get_model_file(model_path, model_name): + # Name of the file for storing network parameters + return os.path.join(model_path, model_name + ".tar") + + +def load_model(model_path, model_name, net=None): + """Loads a saved model from disk. + + Args: + model_path: Path of the checkpoint directory + model_name: Name of the model (str) + net: (Optional) If given, the state dict is loaded into this model. Otherwise, a new model is created. + """ + config_file, model_file = _get_config_file(model_path, model_name), _get_model_file(model_path, model_name) + assert os.path.isfile( + config_file + ), f'Could not find the config file "{config_file}". Are you sure this is the correct path and you have your model config stored here?' + assert os.path.isfile( + model_file + ), f'Could not find the model file "{model_file}". Are you sure this is the correct path and you have your model stored here?' 
+ with open(config_file) as f: + config_dict = json.load(f) + if net is None: + act_fn_name = config_dict["act_fn"].pop("name").lower() + act_fn = act_fn_by_name[act_fn_name](**config_dict.pop("act_fn")) + net = BaseNetwork(act_fn=act_fn, **config_dict) + net.load_state_dict(torch.load(model_file, map_location=device)) + return net + + +def save_model(model, model_path, model_name): + """Given a model, we save the state_dict and hyperparameters. + + Args: + model: Network object to save parameters from + model_path: Path of the checkpoint directory + model_name: Name of the model (str) + """ + config_dict = model.config + os.makedirs(model_path, exist_ok=True) + config_file, model_file = _get_config_file(model_path, model_name), _get_model_file(model_path, model_name) + with open(config_file, "w") as f: + json.dump(config_dict, f) + torch.save(model.state_dict(), model_file) + + +# %% [markdown] +# We also set up the dataset we want to train it on, namely [FashionMNIST](https://github.com/zalandoresearch/fashion-mnist). +# FashionMNIST is a more complex version of MNIST and contains black-and-white images of clothes instead of digits. +# The 10 classes include trousers, coats, shoes, bags and more. +# To load this dataset, we will make use of yet another PyTorch package, namely `torchvision` ([documentation](https://pytorch.org/vision/stable/index.html)). +# The `torchvision` package consists of popular datasets, model architectures, and common image transformations for computer vision. +# We will use the package for many of the notebooks in this course to simplify our dataset handling. +# +# Let's load the dataset below, and visualize a few images to get an impression of the data. + +# %% + +# Transformations applied on each image => first make them a tensor, then normalize them in the range -1 to 1 +transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]) + +# Loading the training dataset. We need to split it into a training and validation part +train_dataset = FashionMNIST(root=DATASET_PATH, train=True, transform=transform, download=True) +train_set, val_set = torch.utils.data.random_split(train_dataset, [50000, 10000]) + +# Loading the test set +test_set = FashionMNIST(root=DATASET_PATH, train=False, transform=transform, download=True) + +# %% [markdown] +# We define a set of data loaders that we can use for various purposes later. +# Note that for actually training a model, we will use different data loaders +# with a lower batch size. + +# %% +train_loader = data.DataLoader(train_set, batch_size=1024, shuffle=True, drop_last=False) +val_loader = data.DataLoader(val_set, batch_size=1024, shuffle=False, drop_last=False) +test_loader = data.DataLoader(test_set, batch_size=1024, shuffle=False, drop_last=False) + +# %% +exmp_imgs = [train_set[i][0] for i in range(16)] +# Organize the images into a grid for nicer visualization +img_grid = torchvision.utils.make_grid(torch.stack(exmp_imgs, dim=0), nrow=4, normalize=True, pad_value=0.5) +img_grid = img_grid.permute(1, 2, 0) + +plt.figure(figsize=(8, 8)) +plt.title("FashionMNIST examples") +plt.imshow(img_grid) +plt.axis("off") +plt.show() +plt.close() + +# %% [markdown] +# ### Visualizing the gradient flow after initialization +# +# As mentioned previously, one important aspect of activation functions is how they propagate gradients through the network. +# Imagine we have a very deep neural network with more than 50 layers. +# The gradients for the input layer, i.e. 
the very first layer, have passed >50 times the activation function, but we still want them to be of a reasonable size. +# If the gradient through the activation function is (in expectation) considerably smaller than 1, our gradients will vanish until they reach the input layer. +# If the gradient through the activation function is larger than 1, the gradients exponentially increase and might explode. +# +# To get a feeling of how every activation function influences the +# gradients, we can look at a freshly initialized network and measure the +# gradients for each parameter for a batch of 256 images: + + +# %% +def visualize_gradients(net, color="C0"): + """Visualize gradients. + + Args: + net: Object of class BaseNetwork + color: Color in which we want to visualize the histogram (for easier separation of activation functions) + """ + net.eval() + small_loader = data.DataLoader(train_set, batch_size=256, shuffle=False) + imgs, labels = next(iter(small_loader)) + imgs, labels = imgs.to(device), labels.to(device) + + # Pass one batch through the network, and calculate the gradients for the weights + net.zero_grad() + preds = net(imgs) + loss = F.cross_entropy(preds, labels) + loss.backward() + # We limit our visualization to the weight parameters and exclude the bias to reduce the number of plots + grads = { + name: params.grad.data.view(-1).cpu().clone().numpy() + for name, params in net.named_parameters() + if "weight" in name + } + net.zero_grad() + + # Plotting + columns = len(grads) + fig, ax = plt.subplots(1, columns, figsize=(columns * 3.5, 2.5)) + fig_index = 0 + for key in grads: + key_ax = ax[fig_index % columns] + sns.histplot(data=grads[key], bins=30, ax=key_ax, color=color, kde=True) + key_ax.set_title(str(key)) + key_ax.set_xlabel("Grad magnitude") + fig_index += 1 + fig.suptitle( + f"Gradient magnitude distribution for activation function {net.config['act_fn']['name']}", fontsize=14, y=1.05 + ) + fig.subplots_adjust(wspace=0.45) + plt.show() + plt.close() + + +# %% +# Seaborn prints warnings if histogram has small values. We can ignore them for now +warnings.filterwarnings("ignore") +# Create a plot for every activation function +for i, act_fn_name in enumerate(act_fn_by_name): + # Setting the seed ensures that we have the same weight initialization for each activation function + set_seed(42) + act_fn = act_fn_by_name[act_fn_name]() + net_actfn = BaseNetwork(act_fn=act_fn).to(device) + visualize_gradients(net_actfn, color=f"C{i}") + +# %% [markdown] +# The sigmoid activation function shows a clearly undesirable behavior. +# While the gradients for the output layer are very large with up to 0.1, the input layer has the lowest gradient norm across all activation functions with only 1e-5. +# This is due to its small maximum gradient of 1/4, and finding a suitable learning rate across all layers is not possible in this setup. +# All the other activation functions show to have similar gradient norms across all layers. +# Interestingly, the ReLU activation has a spike around 0 which is caused by its zero-part on the left, and dead neurons (we will take a closer look at this later on). +# +# Note that additionally to the activation, the initialization of the weight parameters can be crucial. +# By default, PyTorch uses the [Kaiming](https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.kaiming_uniform_) initialization for linear layers optimized for Tanh activations. 
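+# Concretely, this default draws the weights of a linear layer uniformly from $[-1/\sqrt{d_\text{in}}, 1/\sqrt{d_\text{in}}]$,
+# where $d_\text{in}$ is the number of input features of that layer (a side note we add here for reference).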
+# In Tutorial 4, we will take a closer look at initialization, but assume +# for now that the Kaiming initialization works for all activation +# functions reasonably well. + +# %% [markdown] +# ### Training a model +# +# Next, we want to train our model with different activation functions on FashionMNIST and compare the gained performance. +# All in all, our final goal is to achieve the best possible performance on a dataset of our choice. +# Therefore, we write a training loop in the next cell including a +# validation after every epoch and a final test on the best model: + + +# %% +def train_model(net, model_name, max_epochs=50, patience=7, batch_size=256, overwrite=False): + """Train a model on the training set of FashionMNIST. + + Args: + net: Object of BaseNetwork + model_name: (str) Name of the model, used for creating the checkpoint names + max_epochs: Number of epochs we want to (maximally) train for + patience: If the performance on the validation set has not improved for #patience epochs, we stop training early + batch_size: Size of batches used in training + overwrite: Determines how to handle the case when there already exists a checkpoint. If True, it will be overwritten. Otherwise, we skip training. + """ + file_exists = os.path.isfile(_get_model_file(CHECKPOINT_PATH, model_name)) + if file_exists and not overwrite: + print("Model file already exists. Skipping training...") + else: + if file_exists: + print("Model file exists, but will be overwritten...") + + # Defining optimizer, loss and data loader + optimizer = optim.SGD(net.parameters(), lr=1e-2, momentum=0.9) # Default parameters, feel free to change + loss_module = nn.CrossEntropyLoss() + train_loader_local = data.DataLoader( + train_set, batch_size=batch_size, shuffle=True, drop_last=True, pin_memory=True + ) + + val_scores = [] + best_val_epoch = -1 + for epoch in range(max_epochs): + ############ + # Training # + ############ + net.train() + true_preds, count = 0.0, 0 + for imgs, labels in tqdm(train_loader_local, desc=f"Epoch {epoch+1}", leave=False): + imgs, labels = imgs.to(device), labels.to(device) # To GPU + optimizer.zero_grad() # Zero-grad can be placed anywhere before "loss.backward()" + preds = net(imgs) + loss = loss_module(preds, labels) + loss.backward() + optimizer.step() + # Record statistics during training + true_preds += (preds.argmax(dim=-1) == labels).sum() + count += labels.shape[0] + train_acc = true_preds / count + + ############## + # Validation # + ############## + val_acc = test_model(net, val_loader) + val_scores.append(val_acc) + print( + f"[Epoch {epoch+1:2i}] Training accuracy: {train_acc*100.0:05.2f}%, Validation accuracy: {val_acc*100.0:05.2f}%" + ) + + if len(val_scores) == 1 or val_acc > val_scores[best_val_epoch]: + print("\t (New best performance, saving model...)") + save_model(net, CHECKPOINT_PATH, model_name) + best_val_epoch = epoch + elif best_val_epoch <= epoch - patience: + print(f"Early stopping due to no improvement over the last {patience} epochs") + break + + # Plot a curve of the validation accuracy + plt.plot([i for i in range(1, len(val_scores) + 1)], val_scores) + plt.xlabel("Epochs") + plt.ylabel("Validation accuracy") + plt.title(f"Validation performance of {model_name}") + plt.show() + plt.close() + + load_model(CHECKPOINT_PATH, model_name, net=net) + test_acc = test_model(net, test_loader) + print((f" Test accuracy: {test_acc*100.0:4.2f}% ").center(50, "=") + "\n") + return test_acc + + +def test_model(net, data_loader): + """Test a model on a specified 
dataset. + + Args: + net: Trained model of type BaseNetwork + data_loader: DataLoader object of the dataset to test on (validation or test) + """ + net.eval() + true_preds, count = 0.0, 0 + for imgs, labels in data_loader: + imgs, labels = imgs.to(device), labels.to(device) + with torch.no_grad(): + preds = net(imgs).argmax(dim=-1) + true_preds += (preds == labels).sum().item() + count += labels.shape[0] + test_acc = true_preds / count + return test_acc + + +# %% [markdown] +# We train one model for each activation function. +# We recommend using the pretrained models to save time if you are running this notebook on CPU. + +# %% +for act_fn_name in act_fn_by_name: + print(f"Training BaseNetwork with {act_fn_name} activation...") + set_seed(42) + act_fn = act_fn_by_name[act_fn_name]() + net_actfn = BaseNetwork(act_fn=act_fn).to(device) + train_model(net_actfn, f"FashionMNIST_{act_fn_name}", overwrite=False) + +# %% [markdown] +# Not surprisingly, the model using the sigmoid activation function shows to fail and does not improve upon random performance (10 classes => 1/10 for random chance). +# +# All the other activation functions gain similar performance. +# To have a more accurate conclusion, we would have to train the models for multiple seeds and look at the averages. +# However, the "optimal" activation function also depends on many other factors (hidden sizes, number of layers, type of layers, task, dataset, optimizer, learning rate, etc.) +# so that a thorough grid search would not be useful in our case. +# In the literature, activation functions that have shown to work well +# with deep networks are all types of ReLU functions we experiment with +# here, with small gains for specific activation functions in specific +# networks. + +# %% [markdown] +# ### Visualizing the activation distribution + +# %% [markdown] +# After we have trained the models, we can look at the actual activation values that find inside the model. +# For instance, how many neurons are set to zero in ReLU? +# Where do we find most values in Tanh? 
+# To answer these questions, we can write a simple function which takes a +# trained model, applies it to a batch of images, and plots the histogram +# of the activations inside the network: + + +# %% +def visualize_activations(net, color="C0"): + activations = {} + + net.eval() + small_loader = data.DataLoader(train_set, batch_size=1024) + imgs, labels = next(iter(small_loader)) + with torch.no_grad(): + layer_index = 0 + imgs = imgs.to(device) + imgs = imgs.view(imgs.size(0), -1) + # We need to manually loop through the layers to save all activations + for layer_index, layer in enumerate(net.layers[:-1]): + imgs = layer(imgs) + activations[layer_index] = imgs.view(-1).cpu().numpy() + + # Plotting + columns = 4 + rows = math.ceil(len(activations) / columns) + fig, ax = plt.subplots(rows, columns, figsize=(columns * 2.7, rows * 2.5)) + fig_index = 0 + for key in activations: + key_ax = ax[fig_index // columns][fig_index % columns] + sns.histplot(data=activations[key], bins=50, ax=key_ax, color=color, kde=True, stat="density") + key_ax.set_title(f"Layer {key} - {net.layers[key].__class__.__name__}") + fig_index += 1 + fig.suptitle(f"Activation distribution for activation function {net.config['act_fn']['name']}", fontsize=14) + fig.subplots_adjust(hspace=0.4, wspace=0.4) + plt.show() + plt.close() + + +# %% +for i, act_fn_name in enumerate(act_fn_by_name): + net_actfn = load_model(model_path=CHECKPOINT_PATH, model_name=f"FashionMNIST_{act_fn_name}").to(device) + visualize_activations(net_actfn, color=f"C{i}") + +# %% [markdown] +# As the model with sigmoid activation was not able to train properly, the activations are also less informative and all gathered around 0.5 (the activation at input 0). +# +# The tanh shows a more diverse behavior. +# While for the input layer we experience a larger amount of neurons to be close to -1 and 1, where the gradients are close to zero, the activations in the two consecutive layers are closer to zero. +# This is probably because the input layers look for specific features in the input image, and the consecutive layers combine those together. +# The activations for the last layer are again more biased to the extreme points because the classification layer can be seen as a weighted average of those values (the gradients push the activations to those extremes). +# +# The ReLU has a strong peak at 0, as we initially expected. +# The effect of having no gradients for negative values is that the network does not have a Gaussian-like distribution after the linear layers, but a longer tail towards the positive values. +# The LeakyReLU shows a very similar behavior while ELU follows again a more Gaussian-like distribution. +# The Swish activation seems to lie in between, although it is worth noting that Swish uses significantly higher values than other activation functions (up to 20). +# +# As all activation functions show slightly different behavior although +# obtaining similar performance for our simple network, it becomes +# apparent that the selection of the "optimal" activation function really +# depends on many factors, and is not the same for all possible networks. + +# %% [markdown] +# ### Finding dead neurons in ReLU networks + +# %% [markdown] +# One known drawback of the ReLU activation is the occurrence of "dead neurons", i.e. neurons with no gradient for any training input. 
+# The issue of dead neurons is that as no gradient is provided for the layer, we cannot train the parameters of this neuron in the previous layer to obtain output values besides zero. +# For dead neurons to happen, the output value of a specific neuron of the linear layer before the ReLU has to be negative for all input images. +# Considering the large number of neurons we have in a neural network, it is not unlikely for this to happen. +# +# To get a better understanding of how much of a problem this is, and when we need to be careful, we will measure how many dead neurons different networks have. +# For this, we implement a function which runs the network on the whole +# training set and records whether a neuron is exactly 0 for all data +# points or not: + + +# %% +@torch.no_grad() +def measure_number_dead_neurons(net): + """Function to measure the number of dead neurons in a trained neural network. + + For each neuron, we create a boolean variable initially set to 1. If it has an activation unequals 0 at any time, we + set this variable to 0. After running through the whole training set, only dead neurons will have a 1. + """ + neurons_dead = [ + torch.ones(layer.weight.shape[0], device=device, dtype=torch.bool) + for layer in net.layers[:-1] + if isinstance(layer, nn.Linear) + ] # Same shapes as hidden size in BaseNetwork + + net.eval() + for imgs, labels in tqdm(train_loader, leave=False): # Run through whole training set + layer_index = 0 + imgs = imgs.to(device) + imgs = imgs.view(imgs.size(0), -1) + for layer in net.layers[:-1]: + imgs = layer(imgs) + if isinstance(layer, ActivationFunction): + # Are all activations == 0 in the batch, and we did not record the opposite in the last batches? + neurons_dead[layer_index] = torch.logical_and(neurons_dead[layer_index], (imgs == 0).all(dim=0)) + layer_index += 1 + number_neurons_dead = [t.sum().item() for t in neurons_dead] + print("Number of dead neurons:", number_neurons_dead) + print( + "In percentage:", + ", ".join( + [f"{(100.0 * num_dead / tens.shape[0]):4.2f}%" for tens, num_dead in zip(neurons_dead, number_neurons_dead)] + ), + ) + + +# %% [markdown] +# First, we can measure the number of dead neurons for an untrained network: + +# %% +set_seed(42) +net_relu = BaseNetwork(act_fn=ReLU()).to(device) +measure_number_dead_neurons(net_relu) + +# %% [markdown] +# We see that only a minor amount of neurons are dead, but that they increase with the depth of the layer. +# However, this is not a problem for the small number of dead neurons we have as the input to later layers is changed due to updates to the weights of previous layers. +# Therefore, dead neurons in later layers can potentially become "alive"/active again. +# +# How does this look like for a trained network (with the same initialization)? + +# %% +net_relu = load_model(model_path=CHECKPOINT_PATH, model_name="FashionMNIST_relu").to(device) +measure_number_dead_neurons(net_relu) + +# %% [markdown] +# The number of dead neurons indeed decreased in the later layers. +# However, it should be noted that dead neurons are especially problematic in the input layer. +# As the input does not change over epochs (the training set is kept as it is), training the network cannot turn those neurons back active. +# Still, the input data has usually a sufficiently high standard deviation to reduce the risk of dead neurons. +# +# Finally, we check how the number of dead neurons behaves with increasing layer depth. 
+# For instance, let's take the following 10-layer neural network: + +# %% +set_seed(42) +net_relu = BaseNetwork( + act_fn=ReLU(), + hidden_sizes=[256, 256, 256, 256, 256, 128, 128, 128, 128, 128], +).to(device) +measure_number_dead_neurons(net_relu) + +# %% [markdown] +# The number of dead neurons is significantly higher than before which harms the gradient flow especially in the first iterations. +# For instance, more than 56% of the neurons in the pre-last layer are dead which creates a considerable bottleneck. +# Hence, it is advisible to use other nonlinearities like Swish for very deep networks. + +# %% [markdown] +# ## Conclusion +# +# In this notebook, we have reviewed a set of six activation functions (sigmoid, tanh, ReLU, LeakyReLU, ELU, and Swish) in neural networks, and discussed how they influence the gradient distribution across layers. +# Sigmoid tends to fail deep neural networks as the highest gradient it provides is 0.25 leading to vanishing gradients in early layers. +# All ReLU-based activation functions have shown to perform well, and besides the original ReLU, do not have the issue of dead neurons. +# When implementing your own neural network, it is recommended to start +# with a ReLU-based network and select the specific activation function +# based on the properties of the network. + +# %% [markdown] +# ## References +# +# [1] Ramachandran, Prajit, Barret Zoph, and Quoc V. Le. +# "Searching for activation functions." +# arXiv preprint arXiv:1710.05941 (2017). +# [Paper link](https://arxiv.org/abs/1710.05941) diff --git a/course_UvA-DL/03-initialization-and-optimization/.meta.yml b/course_UvA-DL/03-initialization-and-optimization/.meta.yml new file mode 100644 index 0000000..dee86a0 --- /dev/null +++ b/course_UvA-DL/03-initialization-and-optimization/.meta.yml @@ -0,0 +1,24 @@ +title: "Tutorial 3: Initialization and Optimization" +author: Phillip Lippe +created: 2021-08-27 +updated: 2023-03-14 +license: CC BY-SA +tags: + - Image + - Initialization + - Optimizers +description: | + In this tutorial, we will review techniques for optimization and initialization of neural networks. + When increasing the depth of neural networks, there are various challenges we face. + Most importantly, we need to have a stable gradient flow through the network, as otherwise, we might encounter vanishing or exploding gradients. + This is why we will take a closer look at the following concepts: initialization and optimization. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - torchvision + - matplotlib + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/course_UvA-DL/03-initialization-and-optimization/.thumb.jpg b/course_UvA-DL/03-initialization-and-optimization/.thumb.jpg new file mode 100644 index 0000000..e8d42d4 Binary files /dev/null and b/course_UvA-DL/03-initialization-and-optimization/.thumb.jpg differ diff --git a/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py b/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py new file mode 100644 index 0000000..ea8788e --- /dev/null +++ b/course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py @@ -0,0 +1,1158 @@ +# %% [markdown] +#
+# In the first half of the notebook, we will review different initialization techniques, and go step by step from the simplest initialization to methods that are nowadays used in very deep networks. +# In the second half, we focus on optimization comparing the optimizers SGD, SGD with Momentum, and Adam. +# +# Let's start with importing our standard libraries: + +# %% +import copy +import json +import math +import os +import urllib.request +from urllib.error import HTTPError + +import lightning as L +import matplotlib.pyplot as plt + +# %matplotlib inline +import matplotlib_inline.backend_inline +import numpy as np +import seaborn as sns +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.data as data +from matplotlib import cm +from torchvision import transforms +from torchvision.datasets import FashionMNIST +from tqdm.notebook import tqdm + +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +sns.set() + +# %% [markdown] +# Instead of the `set_seed` function as in Tutorial 3, we can use Lightning's build-in function `L.seed_everything`. +# We will reuse the path variables `DATASET_PATH` and `CHECKPOINT_PATH` as in Tutorial 3. +# Adjust the paths if necessary. + +# %% +# Path to the folder where the datasets are/should be downloaded (e.g. MNIST) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/InitOptim/") + +# Seed everything +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# Fetching the device that will be used throughout this notebook +device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0") +print("Using device", device) + +# %% [markdown] +# In the last part of the notebook, we will train models using three different optimizers. +# The pretrained models for those are downloaded below. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial4/" +# Files to download +pretrained_files = [ + "FashionMNIST_SGD.config", + "FashionMNIST_SGD_results.json", + "FashionMNIST_SGD.tar", + "FashionMNIST_SGDMom.config", + "FashionMNIST_SGDMom_results.json", + "FashionMNIST_SGDMom.tar", + "FashionMNIST_Adam.config", + "FashionMNIST_Adam_results.json", + "FashionMNIST_Adam.tar", +] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print(f"Downloading {file_url}...") + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# ## Preparation + +# %% [markdown] +# Throughout this notebook, we will use a deep fully connected network, similar to our previous tutorial. +# We will also again apply the network to FashionMNIST, so you can relate to the results of Tutorial 3. 
+# We start by loading the FashionMNIST dataset: + +# %% + +# Transformations applied on each image => first make them a tensor, then normalize them with mean 0 and std 1 +transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.2861,), (0.3530,))]) + +# Loading the training dataset. We need to split it into a training and validation part +train_dataset = FashionMNIST(root=DATASET_PATH, train=True, transform=transform, download=True) +train_set, val_set = torch.utils.data.random_split(train_dataset, [50000, 10000]) + +# Loading the test set +test_set = FashionMNIST(root=DATASET_PATH, train=False, transform=transform, download=True) + +# %% [markdown] +# We define a set of data loaders that we can use for various purposes later. +# Note that for actually training a model, we will use different data loaders +# with a lower batch size. + +# %% +train_loader = data.DataLoader(train_set, batch_size=1024, shuffle=True, drop_last=False) +val_loader = data.DataLoader(val_set, batch_size=1024, shuffle=False, drop_last=False) +test_loader = data.DataLoader(test_set, batch_size=1024, shuffle=False, drop_last=False) + +# %% [markdown] +# In comparison to the previous tutorial, we have changed the parameters of the normalization transformation `transforms.Normalize`. +# The normalization is now designed to give us an expected mean of 0 and a standard deviation of 1 across pixels. +# This will be particularly relevant for the discussion about initialization we will look at below, and hence we change it here. +# It should be noted that in most classification tasks, both normalization techniques (between -1 and 1 or mean 0 and stddev 1) have shown to work well. +# We can calculate the normalization parameters by determining the mean and standard deviation on the original images: + +# %% +print("Mean", (train_dataset.data.float() / 255.0).mean().item()) +print("Std", (train_dataset.data.float() / 255.0).std().item()) + +# %% [markdown] +# We can verify the transformation by looking at the statistics of a single batch: + +# %% +imgs, _ = next(iter(train_loader)) +print(f"Mean: {imgs.mean().item():5.3f}") +print(f"Standard deviation: {imgs.std().item():5.3f}") +print(f"Maximum: {imgs.max().item():5.3f}") +print(f"Minimum: {imgs.min().item():5.3f}") + +# %% [markdown] +# Note that the maximum and minimum are not 1 and -1 anymore, but shifted towards the positive values. +# This is because FashionMNIST contains a lot of black pixels, similar to MNIST. +# +# Next, we create a linear neural network. We use the same setup as in the previous tutorial. + + +# %% +class BaseNetwork(nn.Module): + def __init__(self, act_fn, input_size=784, num_classes=10, hidden_sizes=[512, 256, 256, 128]): + """Base Network. + + Args: + act_fn: Object of the activation function that should be used as non-linearity in the network. + input_size: Size of the input images in pixels + num_classes: Number of classes we want to predict + hidden_sizes: A list of integers specifying the hidden layer sizes in the NN + """ + super().__init__() + + # Create the network based on the specified hidden sizes + layers = [] + layer_sizes = [input_size] + hidden_sizes + for layer_index in range(1, len(layer_sizes)): + layers += [nn.Linear(layer_sizes[layer_index - 1], layer_sizes[layer_index]), act_fn] + layers += [nn.Linear(layer_sizes[-1], num_classes)] + # A module list registers a list of modules as submodules (e.g. 
for parameters) + self.layers = nn.ModuleList(layers) + + self.config = { + "act_fn": act_fn.__class__.__name__, + "input_size": input_size, + "num_classes": num_classes, + "hidden_sizes": hidden_sizes, + } + + def forward(self, x): + x = x.view(x.size(0), -1) + for layer in self.layers: + x = layer(x) + return x + + +# %% [markdown] +# For the activation functions, we make use of PyTorch's `torch.nn` library instead of implementing ourselves. +# However, we also define an `Identity` activation function. +# Although this activation function would significantly limit the +# network's modeling capabilities, we will use it in the first steps of +# our discussion about initialization (for simplicity). + + +# %% +class Identity(nn.Module): + def forward(self, x): + return x + + +act_fn_by_name = {"tanh": nn.Tanh, "relu": nn.ReLU, "identity": Identity} + +# %% [markdown] +# Finally, we define a few plotting functions that we will use for our discussions. +# These functions help us to (1) visualize the weight/parameter distribution inside a network, (2) visualize the gradients that the parameters at different layers receive, and (3) the activations, i.e. the output of the linear layers. +# The detailed code is not important, but feel free to take a closer look if interested. + +# %% +############################################################## + + +def plot_dists(val_dict, color="C0", xlabel=None, stat="count", use_kde=True): + columns = len(val_dict) + fig, ax = plt.subplots(1, columns, figsize=(columns * 3, 2.5)) + fig_index = 0 + for key in sorted(val_dict.keys()): + key_ax = ax[fig_index % columns] + sns.histplot( + val_dict[key], + ax=key_ax, + color=color, + bins=50, + stat=stat, + kde=use_kde and ((val_dict[key].max() - val_dict[key].min()) > 1e-8), + ) # Only plot kde if there is variance + hidden_dim_str = ( + r"(%i $\to$ %i)" % (val_dict[key].shape[1], val_dict[key].shape[0]) if len(val_dict[key].shape) > 1 else "" + ) + key_ax.set_title(f"{key} {hidden_dim_str}") + if xlabel is not None: + key_ax.set_xlabel(xlabel) + fig_index += 1 + fig.subplots_adjust(wspace=0.4) + return fig + + +############################################################## + + +def visualize_weight_distribution(model, color="C0"): + weights = {} + for name, param in model.named_parameters(): + if name.endswith(".bias"): + continue + key_name = f"Layer {name.split('.')[1]}" + weights[key_name] = param.detach().view(-1).cpu().numpy() + + # Plotting + fig = plot_dists(weights, color=color, xlabel="Weight vals") + fig.suptitle("Weight distribution", fontsize=14, y=1.05) + plt.show() + plt.close() + + +############################################################## + + +def visualize_gradients(model, color="C0", print_variance=False): + """ + Args: + net: Object of class BaseNetwork + color: Color in which we want to visualize the histogram (for easier separation of activation functions) + """ + model.eval() + small_loader = data.DataLoader(train_set, batch_size=1024, shuffle=False) + imgs, labels = next(iter(small_loader)) + imgs, labels = imgs.to(device), labels.to(device) + + # Pass one batch through the network, and calculate the gradients for the weights + model.zero_grad() + preds = model(imgs) + loss = F.cross_entropy(preds, labels) # Same as nn.CrossEntropyLoss, but as a function instead of module + loss.backward() + # We limit our visualization to the weight parameters and exclude the bias to reduce the number of plots + grads = { + name: params.grad.view(-1).cpu().clone().numpy() + for name, params in 
model.named_parameters() + if "weight" in name + } + model.zero_grad() + + # Plotting + fig = plot_dists(grads, color=color, xlabel="Grad magnitude") + fig.suptitle("Gradient distribution", fontsize=14, y=1.05) + plt.show() + plt.close() + + if print_variance: + for key in sorted(grads.keys()): + print(f"{key} - Variance: {np.var(grads[key])}") + + +############################################################## + + +def visualize_activations(model, color="C0", print_variance=False): + model.eval() + small_loader = data.DataLoader(train_set, batch_size=1024, shuffle=False) + imgs, labels = next(iter(small_loader)) + imgs, labels = imgs.to(device), labels.to(device) + + # Pass one batch through the network, and calculate the gradients for the weights + feats = imgs.view(imgs.shape[0], -1) + activations = {} + with torch.no_grad(): + for layer_index, layer in enumerate(model.layers): + feats = layer(feats) + if isinstance(layer, nn.Linear): + activations[f"Layer {layer_index}"] = feats.view(-1).detach().cpu().numpy() + + # Plotting + fig = plot_dists(activations, color=color, stat="density", xlabel="Activation vals") + fig.suptitle("Activation distribution", fontsize=14, y=1.05) + plt.show() + plt.close() + + if print_variance: + for key in sorted(activations.keys()): + print(f"{key} - Variance: {np.var(activations[key])}") + + +############################################################## + +# %% [markdown] +# ## Initialization +# +# Before starting our discussion about initialization, it should be noted that there exist many very good blog posts about the topic of neural network initialization (for example [deeplearning.ai](https://www.deeplearning.ai/ai-notes/initialization/), or a more [math-focused blog post](https://pouannes.github.io/blog/initialization)). +# In case something remains unclear after this tutorial, we recommend skimming through these blog posts as well. +# +# When initializing a neural network, there are a few properties we would like to have. +# First, the variance of the input should be propagated through the model to the last layer, so that we have a similar standard deviation for the output neurons. +# If the variance would vanish the deeper we go in our model, it becomes much harder to optimize the model as the input to the next layer is basically a single constant value. +# Similarly, if the variance increases, it is likely to explode (i.e. head to infinity) the deeper we design our model. +# The second property we look out for in initialization techniques is a gradient distribution with equal variance across layers. +# If the first layer receives much smaller gradients than the last layer, we will have difficulties in choosing an appropriate learning rate. +# +# As a starting point for finding a good method, we will analyze different initialization based on our linear neural network with no activation function (i.e. an identity). +# We do this because initializations depend on the specific activation +# function used in the network, and we can adjust the initialization +# schemes later on for our specific choice. + +# %% +model = BaseNetwork(act_fn=Identity()).to(device) + +# %% [markdown] +# ### Constant initialization +# +# The first initialization we can consider is to initialize all weights with the same constant value. +# Intuitively, setting all weights to zero is not a good idea as the propagated gradient will be zero. +# However, what happens if we set all weights to a value slightly larger or smaller than 0? 
+# To find out, we can implement a function for setting all parameters below and visualize the gradients. + + +# %% +def const_init(model, fill=0.0): + for name, param in model.named_parameters(): + param.data.fill_(fill) + + +const_init(model, fill=0.005) +visualize_gradients(model) +visualize_activations(model, print_variance=True) + +# %% [markdown] +# As we can see, only the first and the last layer have diverse gradient distributions while the other three layers have the same gradient for all weights (note that this value is unequal 0, but often very close to it). +# Having the same gradient for parameters that have been initialized with the same values means that we will always have the same value for those parameters. +# This would make our layer useless and reduce our effective number of parameters to 1. +# Thus, we cannot use a constant initialization to train our networks. + +# %% [markdown] +# ### Constant variance +# +# From the experiment above, we have seen that a constant value is not working. +# So instead, how about we initialize the parameters by randomly sampling from a distribution like a Gaussian? +# The most intuitive way would be to choose one variance that is used for all layers in the network. +# Let's implement it below, and visualize the activation distribution across layers. + + +# %% +def var_init(model, std=0.01): + for name, param in model.named_parameters(): + param.data.normal_(mean=0.0, std=std) + + +var_init(model, std=0.01) +visualize_activations(model, print_variance=True) + +# %% [markdown] +# The variance of the activation becomes smaller and smaller across layers, and almost vanishes in the last layer. +# Alternatively, we could use a higher standard deviation: + +# %% +var_init(model, std=0.1) +visualize_activations(model, print_variance=True) + +# %% [markdown] +# With a higher standard deviation, the activations are likely to explode. +# You can play around with the specific standard deviation values, but it will be hard to find one that gives us a good activation distribution across layers and is very specific to our model. +# If we would change the hidden sizes or number of layers, you would have +# to search all over again, which is neither efficient nor recommended. + +# %% [markdown] +# ### How to find appropriate initialization values +# +# From our experiments above, we have seen that we need to sample the weights from a distribution, but are not sure which one exactly. +# As a next step, we will try to find the optimal initialization from the perspective of the activation distribution. +# For this, we state two requirements: +# +# 1. The mean of the activations should be zero +# 2. The variance of the activations should stay the same across every layer +# +# Suppose we want to design an initialization for the following layer: $y=Wx+b$ with $y\in\mathbb{R}^{d_y}$, $x\in\mathbb{R}^{d_x}$. +# Our goal is that the variance of each element of $y$ is the same as the input, i.e. $\text{Var}(y_i)=\text{Var}(x_i)=\sigma_x^{2}$, and that the mean is zero. +# We assume $x$ to also have a mean of zero, because, in deep neural networks, $y$ would be the input of another layer. +# This requires the bias and weight to have an expectation of 0. +# Actually, as $b$ is a single element per output neuron and is constant across different inputs, we set it to 0 overall. +# +# Next, we need to calculate the variance with which we need to initialize the weight parameters. 
+# Along the calculation, we will need to following variance rule: given two independent variables, the variance of their product is $\text{Var}(X\cdot Y) = \mathbb{E}(Y)^2\text{Var}(X) + \mathbb{E}(X)^2\text{Var}(Y) + \text{Var}(X)\text{Var}(Y) = \mathbb{E}(Y^2)\mathbb{E}(X^2)-\mathbb{E}(Y)^2\mathbb{E}(X)^2$ ($X$ and $Y$ are not refering to $x$ and $y$, but any random variable). +# +# The needed variance of the weights, $\text{Var}(w_{ij})$, is calculated as follows: +# +# $$ +# \begin{split} +# y_i & = \sum_{j} w_{ij}x_{j}\hspace{10mm}\text{Calculation of a single output neuron without bias}\\ +# \text{Var}(y_i) = \sigma_x^{2} & = \text{Var}\left(\sum_{j} w_{ij}x_{j}\right)\\ +# & = \sum_{j} \text{Var}(w_{ij}x_{j}) \hspace{10mm}\text{Inputs and weights are independent of each other}\\ +# & = \sum_{j} \text{Var}(w_{ij})\cdot\text{Var}(x_{j}) \hspace{10mm}\text{Variance rule (see above) with expectations being zero}\\ +# & = d_x \cdot \text{Var}(w_{ij})\cdot\text{Var}(x_{j}) \hspace{10mm}\text{Variance equal for all $d_x$ elements}\\ +# & = \sigma_x^{2} \cdot d_x \cdot \text{Var}(w_{ij})\\ +# \Rightarrow \text{Var}(w_{ij}) = \sigma_{W}^2 & = \frac{1}{d_x}\\ +# \end{split} +# $$ +# +# Thus, we should initialize the weight distribution with a variance of the inverse of the input dimension $d_x$. +# Let's implement it below and check whether this holds: + + +# %% +def equal_var_init(model): + for name, param in model.named_parameters(): + if name.endswith(".bias"): + param.data.fill_(0) + else: + param.data.normal_(std=1.0 / math.sqrt(param.shape[1])) + + +equal_var_init(model) +visualize_weight_distribution(model) +visualize_activations(model, print_variance=True) + +# %% [markdown] +# As we expected, the variance stays indeed constant across layers. +# Note that our initialization does not restrict us to a normal distribution, but allows any other distribution with a mean of 0 and variance of $1/d_x$. +# You often see that a uniform distribution is used for initialization. +# A small benefit of using a uniform instead of a normal distribution is that we can exclude the chance of initializing very large or small weights. +# +# Besides the variance of the activations, another variance we would like to stabilize is the one of the gradients. +# This ensures a stable optimization for deep networks. +# It turns out that we can do the same calculation as above starting from $\Delta x=W\Delta y$, and come to the conclusion that we should initialize our layers with $1/d_y$ where $d_y$ is the number of output neurons. +# You can do the calculation as a practice, or check a thorough explanation in [this blog post](https://pouannes.github.io/blog/initialization). +# As a compromise between both constraints, [Glorot and Bengio (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf?hc_location=ufi) proposed to use the harmonic mean of both values. 
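+# Concretely, the harmonic mean of the two candidate variances $1/d_x$ and $1/d_y$ works out to
+#
+# $$\frac{2}{\frac{1}{1/d_x}+\frac{1}{1/d_y}}=\frac{2}{d_x+d_y}$$
+#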
+# This leads us to the well-known Xavier initialization: +# +# $$W\sim \mathcal{N}\left(0,\frac{2}{d_x+d_y}\right)$$ +# +# If we use a uniform distribution, we would initialize the weights with: +# +# $$W\sim U\left[-\frac{\sqrt{6}}{\sqrt{d_x+d_y}}, \frac{\sqrt{6}}{\sqrt{d_x+d_y}}\right]$$ +# +# Let's shortly implement it and validate its effectiveness: + + +# %% +def xavier_init(model): + for name, param in model.named_parameters(): + if name.endswith(".bias"): + param.data.fill_(0) + else: + bound = math.sqrt(6) / math.sqrt(param.shape[0] + param.shape[1]) + param.data.uniform_(-bound, bound) + + +xavier_init(model) +visualize_gradients(model, print_variance=True) +visualize_activations(model, print_variance=True) + +# %% [markdown] +# We see that the Xavier initialization balances the variance of gradients and activations. +# Note that the significantly higher variance for the output layer is due to the large difference of input and output dimension ($128$ vs $10$). +# However, we currently assumed the activation function to be linear. +# So what happens if we add a non-linearity? +# In a tanh-based network, a common assumption is that for small values during the initial steps in training, the $\tanh$ works as a linear function such that we don't have to adjust our calculation. +# We can check if that is the case for us as well: + +# %% +model = BaseNetwork(act_fn=nn.Tanh()).to(device) +xavier_init(model) +visualize_gradients(model, print_variance=True) +visualize_activations(model, print_variance=True) + +# %% [markdown] +# Although the variance decreases over depth, it is apparent that the activation distribution becomes more focused on the low values. +# Therefore, our variance will stabilize around 0.25 if we would go even deeper. +# Hence, we can conclude that the Xavier initialization works well for Tanh networks. +# But what about ReLU networks? +# Here, we cannot take the previous assumption of the non-linearity becoming linear for small values. +# The ReLU activation function sets (in expectation) half of the inputs to 0 so that also the expectation of the input is not zero. +# However, as long as the expectation of $W$ is zero and $b=0$, the expectation of the output is zero. +# The part where the calculation of the ReLU initialization differs from the identity is when determining $\text{Var}(w_{ij}x_{j})$: +# +# $$\text{Var}(w_{ij}x_{j})=\underbrace{\mathbb{E}[w_{ij}^2]}_{=\text{Var}(w_{ij})}\mathbb{E}[x_{j}^2]-\underbrace{\mathbb{E}[w_{ij}]^2}_{=0}\mathbb{E}[x_{j}]^2=\text{Var}(w_{ij})\mathbb{E}[x_{j}^2]$$ +# +# If we assume now that $x$ is the output of a ReLU activation (from a previous layer, $x=max(0,\tilde{y})$), we can calculate the expectation as follows: +# +# +# $$ +# \begin{split} +# \mathbb{E}[x^2] & =\mathbb{E}[\max(0,\tilde{y})^2]\\ +# & =\frac{1}{2}\mathbb{E}[{\tilde{y}}^2]\hspace{2cm}\tilde{y}\text{ is zero-centered and symmetric}\\ +# & =\frac{1}{2}\text{Var}(\tilde{y}) +# \end{split}$$ +# +# Thus, we see that we have an additional factor of 1/2 in the equation, so that our desired weight variance becomes $2/d_x$. +# This gives us the Kaiming initialization (see [He, K. et al. +# (2015)](https://arxiv.org/pdf/1502.01852.pdf)). +# Note that the Kaiming initialization does not use the harmonic mean between input and output size. +# In their paper (Section 2.2, Backward Propagation, last paragraph), they argue that using $d_x$ or $d_y$ both lead to stable gradients throughout the network, and only depend on the overall input and output size of the network. 
+# Hence, we can use here only the input $d_x$: + + +# %% +def kaiming_init(model): + for name, param in model.named_parameters(): + if name.endswith(".bias"): + param.data.fill_(0) + elif name.startswith("layers.0"): # The first layer does not have ReLU applied on its input + param.data.normal_(0, 1 / math.sqrt(param.shape[1])) + else: + param.data.normal_(0, math.sqrt(2) / math.sqrt(param.shape[1])) + + +model = BaseNetwork(act_fn=nn.ReLU()).to(device) +kaiming_init(model) +visualize_gradients(model, print_variance=True) +visualize_activations(model, print_variance=True) + +# %% [markdown] +# The variance stays stable across layers. +# We can conclude that the Kaiming initialization indeed works well for ReLU-based networks. +# Note that for Leaky-ReLU etc., we have to slightly adjust the factor of $2$ in the variance as half of the values are not set to zero anymore. +# PyTorch provides a function to calculate this factor for many activation +# function, see `torch.nn.init.calculate_gain` +# ([link](https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.calculate_gain)). + +# %% [markdown] +# ## Optimization +# +#
+# +# Besides initialization, selecting a suitable optimization algorithm can be an important choice for deep neural networks. +# Before taking a closer look at them, we should define code for training the models. +# Most of the following code is copied from the previous tutorial, and only slightly altered to fit our needs. + + +# %% +def _get_config_file(model_path, model_name): + return os.path.join(model_path, model_name + ".config") + + +def _get_model_file(model_path, model_name): + return os.path.join(model_path, model_name + ".tar") + + +def _get_result_file(model_path, model_name): + return os.path.join(model_path, model_name + "_results.json") + + +def load_model(model_path, model_name, net=None): + config_file = _get_config_file(model_path, model_name) + model_file = _get_model_file(model_path, model_name) + assert os.path.isfile( + config_file + ), f'Could not find the config file "{config_file}". Are you sure this is the correct path and you have your model config stored here?' + assert os.path.isfile( + model_file + ), f'Could not find the model file "{model_file}". Are you sure this is the correct path and you have your model stored here?' + with open(config_file) as f: + config_dict = json.load(f) + if net is None: + act_fn_name = config_dict["act_fn"].pop("name").lower() + assert ( + act_fn_name in act_fn_by_name + ), f'Unknown activation function "{act_fn_name}". Please add it to the "act_fn_by_name" dict.' + act_fn = act_fn_by_name[act_fn_name]() + net = BaseNetwork(act_fn=act_fn, **config_dict) + net.load_state_dict(torch.load(model_file)) + return net + + +def save_model(model, model_path, model_name): + config_dict = model.config + os.makedirs(model_path, exist_ok=True) + config_file = _get_config_file(model_path, model_name) + model_file = _get_model_file(model_path, model_name) + with open(config_file, "w") as f: + json.dump(config_dict, f) + torch.save(model.state_dict(), model_file) + + +def train_model(net, model_name, optim_func, max_epochs=50, batch_size=256, overwrite=False): + """Train a model on the training set of FashionMNIST. + + Args: + net: Object of BaseNetwork + model_name: (str) Name of the model, used for creating the checkpoint names + max_epochs: Number of epochs we want to (maximally) train for + patience: If the performance on the validation set has not improved for #patience epochs, we stop training early + batch_size: Size of batches used in training + overwrite: Determines how to handle the case when there already exists a checkpoint. If True, it will be overwritten. Otherwise, we skip training. + """ + file_exists = os.path.isfile(_get_model_file(CHECKPOINT_PATH, model_name)) + if file_exists and not overwrite: + print(f'Model file of "{model_name}" already exists. 
Skipping training...') + with open(_get_result_file(CHECKPOINT_PATH, model_name)) as f: + results = json.load(f) + else: + if file_exists: + print("Model file exists, but will be overwritten...") + + # Defining optimizer, loss and data loader + optimizer = optim_func(net.parameters()) + loss_module = nn.CrossEntropyLoss() + train_loader_local = data.DataLoader( + train_set, batch_size=batch_size, shuffle=True, drop_last=True, pin_memory=True + ) + + results = None + val_scores = [] + train_losses, train_scores = [], [] + best_val_epoch = -1 + for epoch in range(max_epochs): + train_acc, val_acc, epoch_losses = epoch_iteration( + net, loss_module, optimizer, train_loader_local, val_loader, epoch + ) + train_scores.append(train_acc) + val_scores.append(val_acc) + train_losses += epoch_losses + + if len(val_scores) == 1 or val_acc > val_scores[best_val_epoch]: + print("\t (New best performance, saving model...)") + save_model(net, CHECKPOINT_PATH, model_name) + best_val_epoch = epoch + + if results is None: + load_model(CHECKPOINT_PATH, model_name, net=net) + test_acc = test_model(net, test_loader) + results = { + "test_acc": test_acc, + "val_scores": val_scores, + "train_losses": train_losses, + "train_scores": train_scores, + } + with open(_get_result_file(CHECKPOINT_PATH, model_name), "w") as f: + json.dump(results, f) + + # Plot a curve of the validation accuracy + sns.set() + plt.plot([i for i in range(1, len(results["train_scores"]) + 1)], results["train_scores"], label="Train") + plt.plot([i for i in range(1, len(results["val_scores"]) + 1)], results["val_scores"], label="Val") + plt.xlabel("Epochs") + plt.ylabel("Validation accuracy") + plt.ylim(min(results["val_scores"]), max(results["train_scores"]) * 1.01) + plt.title(f"Validation performance of {model_name}") + plt.legend() + plt.show() + plt.close() + + print((f" Test accuracy: {results['test_acc']*100.0:4.2f}% ").center(50, "=") + "\n") + return results + + +def epoch_iteration(net, loss_module, optimizer, train_loader_local, val_loader, epoch): + ############ + # Training # + ############ + net.train() + true_preds, count = 0.0, 0 + epoch_losses = [] + t = tqdm(train_loader_local, leave=False) + for imgs, labels in t: + imgs, labels = imgs.to(device), labels.to(device) + optimizer.zero_grad() + preds = net(imgs) + loss = loss_module(preds, labels) + loss.backward() + optimizer.step() + # Record statistics during training + true_preds += (preds.argmax(dim=-1) == labels).sum().item() + count += labels.shape[0] + t.set_description(f"Epoch {epoch+1}: loss={loss.item():4.2f}") + epoch_losses.append(loss.item()) + train_acc = true_preds / count + + ############## + # Validation # + ############## + val_acc = test_model(net, val_loader) + print( + f"[Epoch {epoch+1:2i}] Training accuracy: {train_acc*100.0:05.2f}%, Validation accuracy: {val_acc*100.0:05.2f}%" + ) + return train_acc, val_acc, epoch_losses + + +def test_model(net, data_loader): + """Test a model on a specified dataset. + + Args: + net: Trained model of type BaseNetwork + data_loader: DataLoader object of the dataset to test on (validation or test) + """ + net.eval() + true_preds, count = 0.0, 0 + for imgs, labels in data_loader: + imgs, labels = imgs.to(device), labels.to(device) + with torch.no_grad(): + preds = net(imgs).argmax(dim=-1) + true_preds += (preds == labels).sum().item() + count += labels.shape[0] + test_acc = true_preds / count + return test_acc + + +# %% [markdown] +# First, we need to understand what an optimizer actually does. 
+# The optimizer is responsible to update the network's parameters given the gradients. +# Hence, we effectively implement a function $w^{t} = f(w^{t-1}, g^{t}, ...)$ with $w$ being the parameters, and $g^{t} = \nabla_{w^{(t-1)}} \mathcal{L}^{(t)}$ the gradients at time step $t$. +# A common, additional parameter to this function is the learning rate, here denoted by $\eta$. +# Usually, the learning rate can be seen as the "step size" of the update. +# A higher learning rate means that we change the weights more in the direction of the gradients, a smaller means we take shorter steps. +# +# As most optimizers only differ in the implementation of $f$, we can define a template for an optimizer in PyTorch below. +# We take as input the parameters of a model and a learning rate. +# The function `zero_grad` sets the gradients of all parameters to zero, which we have to do before calling `loss.backward()`. +# Finally, the `step()` function tells the optimizer to update all weights based on their gradients. +# The template is setup below: + + +# %% +class OptimizerTemplate: + def __init__(self, params, lr): + self.params = list(params) + self.lr = lr + + def zero_grad(self): + # Set gradients of all parameters to zero + for p in self.params: + if p.grad is not None: + p.grad.detach_() # For second-order optimizers important + p.grad.zero_() + + @torch.no_grad() + def step(self): + # Apply update step to all parameters + for p in self.params: + if p.grad is None: # We skip parameters without any gradients + continue + self.update_param(p) + + def update_param(self, p): + # To be implemented in optimizer-specific classes + raise NotImplementedError + + +# %% [markdown] +# The first optimizer we are going to implement is the standard Stochastic Gradient Descent (SGD). +# SGD updates the parameters using the following equation: +# +# $$ +# \begin{split} +# w^{(t)} & = w^{(t-1)} - \eta \cdot g^{(t)} +# \end{split} +# $$ +# +# As simple as the equation is also our implementation of SGD: + + +# %% +class SGD(OptimizerTemplate): + def __init__(self, params, lr): + super().__init__(params, lr) + + def update_param(self, p): + p_update = -self.lr * p.grad + p.add_(p_update) # In-place update => saves memory and does not create computation graph + + +# %% [markdown] +# In the lecture, we also have discussed the concept of momentum which replaces the gradient in the update by an exponential average of all past gradients including the current one: +# +# $$ +# \begin{split} +# m^{(t)} & = \beta_1 m^{(t-1)} + (1 - \beta_1)\cdot g^{(t)}\\ +# w^{(t)} & = w^{(t-1)} - \eta \cdot m^{(t)}\\ +# \end{split} +# $$ +# +# Let's also implement it below: + + +# %% +class SGDMomentum(OptimizerTemplate): + def __init__(self, params, lr, momentum=0.0): + super().__init__(params, lr) + self.momentum = momentum # Corresponds to beta_1 in the equation above + self.param_momentum = {p: torch.zeros_like(p.data) for p in self.params} # Dict to store m_t + + def update_param(self, p): + self.param_momentum[p] = (1 - self.momentum) * p.grad + self.momentum * self.param_momentum[p] + p_update = -self.lr * self.param_momentum[p] + p.add_(p_update) + + +# %% [markdown] +# Finally, we arrive at Adam. +# Adam combines the idea of momentum with an adaptive learning rate, which is based on an exponential average of the squared gradients, i.e. the gradients norm. 
+# Furthermore, we add a bias correction for the momentum and adaptive learning rate for the first iterations: +# +# $$ +# \begin{split} +# m^{(t)} & = \beta_1 m^{(t-1)} + (1 - \beta_1)\cdot g^{(t)}\\ +# v^{(t)} & = \beta_2 v^{(t-1)} + (1 - \beta_2)\cdot \left(g^{(t)}\right)^2\\ +# \hat{m}^{(t)} & = \frac{m^{(t)}}{1-\beta^{t}_1}, \hat{v}^{(t)} = \frac{v^{(t)}}{1-\beta^{t}_2}\\ +# w^{(t)} & = w^{(t-1)} - \frac{\eta}{\sqrt{v^{(t)}} + \epsilon}\circ \hat{m}^{(t)}\\ +# \end{split} +# $$ +# +# Epsilon is a small constant used to improve numerical stability for very small gradient norms. +# Remember that the adaptive learning rate does not replace the learning +# rate hyperparameter $\eta$, but rather acts as an extra factor and +# ensures that the gradients of various parameters have a similar norm. + + +# %% +class Adam(OptimizerTemplate): + def __init__(self, params, lr, beta1=0.9, beta2=0.999, eps=1e-8): + super().__init__(params, lr) + self.beta1 = beta1 + self.beta2 = beta2 + self.eps = eps + self.param_step = {p: 0 for p in self.params} # Remembers "t" for each parameter for bias correction + self.param_momentum = {p: torch.zeros_like(p.data) for p in self.params} + self.param_2nd_momentum = {p: torch.zeros_like(p.data) for p in self.params} + + def update_param(self, p): + self.param_step[p] += 1 + + self.param_momentum[p] = (1 - self.beta1) * p.grad + self.beta1 * self.param_momentum[p] + self.param_2nd_momentum[p] = (1 - self.beta2) * (p.grad) ** 2 + self.beta2 * self.param_2nd_momentum[p] + + bias_correction_1 = 1 - self.beta1 ** self.param_step[p] + bias_correction_2 = 1 - self.beta2 ** self.param_step[p] + + p_2nd_mom = self.param_2nd_momentum[p] / bias_correction_2 + p_mom = self.param_momentum[p] / bias_correction_1 + p_lr = self.lr / (torch.sqrt(p_2nd_mom) + self.eps) + p_update = -p_lr * p_mom + + p.add_(p_update) + + +# %% [markdown] +# ### Comparing optimizers on model training +# +# After we have implemented three optimizers (SGD, SGD with momentum, and Adam), we can start to analyze and compare them. +# First, we test them on how well they can optimize a neural network on the FashionMNIST dataset. +# We use again our linear network, this time with a ReLU activation and the kaiming initialization, which we have found before to work well for ReLU-based networks. +# Note that the model is over-parameterized for this task, and we can achieve similar performance with a much smaller network (for example `100,100,100`). +# However, our main interest is in how well the optimizer can train *deep* +# neural networks, hence the over-parameterization. + +# %% +base_model = BaseNetwork(act_fn=nn.ReLU(), hidden_sizes=[512, 256, 256, 128]) +kaiming_init(base_model) + +# %% [markdown] +# For a fair comparison, we train the exact same model with the same seed with the three optimizers below. +# Feel free to change the hyperparameters if you want (however, you have to train your own model then). 
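+
+# %% [markdown]
+# As an optional sanity check before the full training runs, our hand-written `Adam` should closely
+# match PyTorch's built-in `torch.optim.Adam` (same default betas and epsilon, no weight decay).
+# The minimal sketch below uses a toy 3-element parameter and a simple quadratic loss; tiny
+# floating point differences are expected.

+# %%
+import torch.optim
+
+w_ours = nn.Parameter(torch.ones(3))
+w_ref = nn.Parameter(torch.ones(3))
+opt_ours = Adam([w_ours], lr=1e-3)
+opt_ref = torch.optim.Adam([w_ref], lr=1e-3)
+for _ in range(10):
+    for w, opt in [(w_ours, opt_ours), (w_ref, opt_ref)]:
+        opt.zero_grad()
+        (w**2).sum().backward()  # simple quadratic loss, gradient = 2 * w
+        opt.step()
+print("Max abs difference to torch.optim.Adam:", (w_ours - w_ref).abs().max().item())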
+ +# %% +SGD_model = copy.deepcopy(base_model).to(device) +SGD_results = train_model( + SGD_model, "FashionMNIST_SGD", lambda params: SGD(params, lr=1e-1), max_epochs=40, batch_size=256 +) + +# %% +SGDMom_model = copy.deepcopy(base_model).to(device) +SGDMom_results = train_model( + SGDMom_model, + "FashionMNIST_SGDMom", + lambda params: SGDMomentum(params, lr=1e-1, momentum=0.9), + max_epochs=40, + batch_size=256, +) + +# %% +Adam_model = copy.deepcopy(base_model).to(device) +Adam_results = train_model( + Adam_model, "FashionMNIST_Adam", lambda params: Adam(params, lr=1e-3), max_epochs=40, batch_size=256 +) + +# %% [markdown] +# The result is that all optimizers perform similarly well with the given model. +# The differences are too small to find any significant conclusion. +# However, keep in mind that this can also be attributed to the initialization we chose. +# When changing the initialization to worse (e.g. constant initialization), Adam usually shows to be more robust because of its adaptive learning rate. +# To show the specific benefits of the optimizers, we will continue to +# look at some possible loss surfaces in which momentum and adaptive +# learning rate are crucial. + +# %% [markdown] +# ### Pathological curvatures +# +# A pathological curvature is a type of surface that is similar to ravines and is particularly tricky for plain SGD optimization. +# In words, pathological curvatures typically have a steep gradient in one direction with an optimum at the center, while in a second direction we have a slower gradient towards a (global) optimum. +# Let's first create an example surface of this and visualize it: + + +# %% +def pathological_curve_loss(w1, w2): + # Example of a pathological curvature. There are many more possible, feel free to experiment here! + x1_loss = torch.tanh(w1) ** 2 + 0.01 * torch.abs(w1) + x2_loss = torch.sigmoid(w2) + return x1_loss + x2_loss + + +# %% +def plot_curve( + curve_fn, x_range=(-5, 5), y_range=(-5, 5), plot_3d=False, cmap=cm.viridis, title="Pathological curvature" +): + fig = plt.figure() + ax = fig.gca() + if plot_3d: + ax = fig.add_subplot(projection="3d") + + x = torch.arange(x_range[0], x_range[1], (x_range[1] - x_range[0]) / 100.0) + y = torch.arange(y_range[0], y_range[1], (y_range[1] - y_range[0]) / 100.0) + x, y = torch.meshgrid([x, y]) + z = curve_fn(x, y) + x, y, z = x.numpy(), y.numpy(), z.numpy() + + if plot_3d: + ax.plot_surface(x, y, z, cmap=cmap, linewidth=1, color="#000", antialiased=False) + ax.set_zlabel("loss") + else: + ax.imshow(z.T[::-1], cmap=cmap, extent=(x_range[0], x_range[1], y_range[0], y_range[1])) + plt.title(title) + ax.set_xlabel(r"$w_1$") + ax.set_ylabel(r"$w_2$") + plt.tight_layout() + return ax + + +sns.reset_orig() +_ = plot_curve(pathological_curve_loss, plot_3d=True) +plt.show() + +# %% [markdown] +# In terms of optimization, you can image that $w_1$ and $w_2$ are weight parameters, and the curvature represents the loss surface over the space of $w_1$ and $w_2$. +# Note that in typical networks, we have many, many more parameters than two, and such curvatures can occur in multi-dimensional spaces as well. +# +# Ideally, our optimization algorithm would find the center of the ravine and focuses on optimizing the parameters towards the direction of $w_2$. +# However, if we encounter a point along the ridges, the gradient is much greater in $w_1$ than $w_2$, and we might end up jumping from one side to the other. 
+# Due to the large gradients, we would have to reduce our learning rate slowing down learning significantly. +# +# To test our algorithms, we can implement a simple function to train two parameters on such a surface: + + +# %% +def train_curve(optimizer_func, curve_func=pathological_curve_loss, num_updates=100, init=[5, 5]): + """ + Args: + optimizer_func: Constructor of the optimizer to use. Should only take a parameter list + curve_func: Loss function (e.g. pathological curvature) + num_updates: Number of updates/steps to take when optimizing + init: Initial values of parameters. Must be a list/tuple with two elements representing w_1 and w_2 + Returns: + Numpy array of shape [num_updates, 3] with [t,:2] being the parameter values at step t, and [t,2] the loss at t. + """ + weights = nn.Parameter(torch.FloatTensor(init), requires_grad=True) + optim = optimizer_func([weights]) + + list_points = [] + for _ in range(num_updates): + loss = curve_func(weights[0], weights[1]) + list_points.append(torch.cat([weights.data.detach(), loss.unsqueeze(dim=0).detach()], dim=0)) + optim.zero_grad() + loss.backward() + optim.step() + points = torch.stack(list_points, dim=0).numpy() + return points + + +# %% [markdown] +# Next, let's apply the different optimizers on our curvature. +# Note that we set a much higher learning rate for the optimization algorithms as you would in a standard neural network. +# This is because we only have 2 parameters instead of tens of thousands or even millions. + +# %% +SGD_points = train_curve(lambda params: SGD(params, lr=10)) +SGDMom_points = train_curve(lambda params: SGDMomentum(params, lr=10, momentum=0.9)) +Adam_points = train_curve(lambda params: Adam(params, lr=1)) + +# %% [markdown] +# To understand best how the different algorithms worked, we visualize the update step as a line plot through the loss surface. +# We will stick with a 2D representation for readability. + +# %% +all_points = np.concatenate([SGD_points, SGDMom_points, Adam_points], axis=0) +ax = plot_curve( + pathological_curve_loss, + x_range=(-np.absolute(all_points[:, 0]).max(), np.absolute(all_points[:, 0]).max()), + y_range=(all_points[:, 1].min(), all_points[:, 1].max()), + plot_3d=False, +) +ax.plot(SGD_points[:, 0], SGD_points[:, 1], color="red", marker="o", zorder=1, label="SGD") +ax.plot(SGDMom_points[:, 0], SGDMom_points[:, 1], color="blue", marker="o", zorder=2, label="SGDMom") +ax.plot(Adam_points[:, 0], Adam_points[:, 1], color="grey", marker="o", zorder=3, label="Adam") +plt.legend() +plt.show() + +# %% [markdown] +# We can clearly see that SGD is not able to find the center of the optimization curve and has a problem converging due to the steep gradients in $w_1$. +# In contrast, Adam and SGD with momentum nicely converge as the changing direction of $w_1$ is canceling itself out. +# On such surfaces, it is crucial to use momentum. + +# %% [markdown] +# ### Steep optima +# +# A second type of challenging loss surfaces are steep optima. +# In those, we have a larger part of the surface having very small gradients while around the optimum, we have very large gradients. 
+# For instance, take the following loss surfaces: + + +# %% +def bivar_gaussian(w1, w2, x_mean=0.0, y_mean=0.0, x_sig=1.0, y_sig=1.0): + norm = 1 / (2 * np.pi * x_sig * y_sig) + x_exp = (-1 * (w1 - x_mean) ** 2) / (2 * x_sig**2) + y_exp = (-1 * (w2 - y_mean) ** 2) / (2 * y_sig**2) + return norm * torch.exp(x_exp + y_exp) + + +def comb_func(w1, w2): + z = -bivar_gaussian(w1, w2, x_mean=1.0, y_mean=-0.5, x_sig=0.2, y_sig=0.2) + z -= bivar_gaussian(w1, w2, x_mean=-1.0, y_mean=0.5, x_sig=0.2, y_sig=0.2) + z -= bivar_gaussian(w1, w2, x_mean=-0.5, y_mean=-0.8, x_sig=0.2, y_sig=0.2) + return z + + +_ = plot_curve(comb_func, x_range=(-2, 2), y_range=(-2, 2), plot_3d=True, title="Steep optima") + +# %% [markdown] +# Most of the loss surface has very little to no gradients. +# However, close to the optima, we have very steep gradients. +# To reach the minimum when starting in a region with lower gradients, we expect an adaptive learning rate to be crucial. +# To verify this hypothesis, we can run our three optimizers on the surface: + +# %% +SGD_points = train_curve(lambda params: SGD(params, lr=0.5), comb_func, init=[0, 0]) +SGDMom_points = train_curve(lambda params: SGDMomentum(params, lr=1, momentum=0.9), comb_func, init=[0, 0]) +Adam_points = train_curve(lambda params: Adam(params, lr=0.2), comb_func, init=[0, 0]) + +all_points = np.concatenate([SGD_points, SGDMom_points, Adam_points], axis=0) +ax = plot_curve(comb_func, x_range=(-2, 2), y_range=(-2, 2), plot_3d=False, title="Steep optima") +ax.plot(SGD_points[:, 0], SGD_points[:, 1], color="red", marker="o", zorder=3, label="SGD", alpha=0.7) +ax.plot(SGDMom_points[:, 0], SGDMom_points[:, 1], color="blue", marker="o", zorder=2, label="SGDMom", alpha=0.7) +ax.plot(Adam_points[:, 0], Adam_points[:, 1], color="grey", marker="o", zorder=1, label="Adam", alpha=0.7) +ax.set_xlim(-2, 2) +ax.set_ylim(-2, 2) +plt.legend() +plt.show() + +# %% [markdown] +# SGD first takes very small steps until it touches the border of the optimum. +# First reaching a point around $(-0.75,-0.5)$, the gradient direction has changed and pushes the parameters to $(0.8,0.5)$ from which SGD cannot recover anymore (only with many, many steps). +# A similar problem has SGD with momentum, only that it continues the direction of the touch of the optimum. +# The gradients from this time step are so much larger than any other point that the momentum $m_t$ is overpowered by it. +# Finally, Adam is able to converge in the optimum showing the importance of adaptive learning rates. + +# %% [markdown] +# ### What optimizer to take +# +# After seeing the results on optimization, what is our conclusion? +# Should we always use Adam and never look at SGD anymore? +# The short answer: no. +# There are many papers saying that in certain situations, SGD (with momentum) generalizes better where Adam often tends to overfit [5,6]. +# This is related to the idea of finding wider optima. +# For instance, see the illustration of different optima below (credit: [Keskar et al., 2017](https://arxiv.org/pdf/1609.04836.pdf)): +# +#
+# +# The black line represents the training loss surface, while the dotted red line is the test loss. +# Finding sharp, narrow minima can be helpful for finding the minimal training loss. +# However, this doesn't mean that it also minimizes the test loss as especially flat minima have shown to generalize better. +# You can imagine that the test dataset has a slightly shifted loss surface due to the different examples than in the training set. +# A small change can have a significant influence for sharp minima, while flat minima are generally more robust to this change. +# +# In the next tutorial, we will see that some network types can still be better optimized with SGD and learning rate scheduling than Adam. +# Nevertheless, Adam is the most commonly used optimizer in Deep Learning +# as it usually performs better than other optimizers, especially for deep +# networks. + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we have looked at initialization and optimization techniques for neural networks. +# We have seen that a good initialization has to balance the preservation of the gradient variance as well as the activation variance. +# This can be achieved with the Xavier initialization for tanh-based networks, and the Kaiming initialization for ReLU-based networks. +# In optimization, concepts like momentum and adaptive learning rate can help with challenging loss surfaces but don't guarantee an increase in performance for neural networks. +# +# +# ## References +# +# [1] Glorot, Xavier, and Yoshua Bengio. +# "Understanding the difficulty of training deep feedforward neural networks." +# Proceedings of the thirteenth international conference on artificial intelligence and statistics. +# 2010. +# [link](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) +# +# [2] He, Kaiming, et al. +# "Delving deep into rectifiers: Surpassing human-level performance on imagenet classification." +# Proceedings of the IEEE international conference on computer vision. +# 2015. +# [link](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html) +# +# [3] Kingma, Diederik P. & Ba, Jimmy. +# "Adam: A Method for Stochastic Optimization." +# Proceedings of the third international conference for learning representations (ICLR). +# 2015. +# [link](https://arxiv.org/abs/1412.6980) +# +# [4] Keskar, Nitish Shirish, et al. +# "On large-batch training for deep learning: Generalization gap and sharp minima." +# Proceedings of the fifth international conference for learning representations (ICLR). +# 2017. +# [link](https://arxiv.org/abs/1609.04836) +# +# [5] Wilson, Ashia C., et al. +# "The Marginal Value of Adaptive Gradient Methods in Machine Learning." +# Advances in neural information processing systems. +# 2017. +# [link](https://papers.nips.cc/paper/7003-the-marginal-value-of-adaptive-gradient-methods-in-machine-learning.pdf) +# +# [6] Ruder, Sebastian. +# "An overview of gradient descent optimization algorithms." +# arXiv preprint. +# 2017. 
+# [link](https://arxiv.org/abs/1609.04747) diff --git a/course_UvA-DL/03-initialization-and-optimization/flat_vs_sharp_minima.svg b/course_UvA-DL/03-initialization-and-optimization/flat_vs_sharp_minima.svg new file mode 100644 index 0000000..c7b6225 --- /dev/null +++ b/course_UvA-DL/03-initialization-and-optimization/flat_vs_sharp_minima.svg @@ -0,0 +1,1456 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/04-inception-resnet-densenet/.meta.yaml b/course_UvA-DL/04-inception-resnet-densenet/.meta.yaml new file mode 100644 index 0000000..dc7b7b0 --- /dev/null +++ b/course_UvA-DL/04-inception-resnet-densenet/.meta.yaml @@ -0,0 +1,23 @@ +title: "Tutorial 4: Inception, ResNet and DenseNet" +author: Phillip Lippe +created: 2021-08-27 +updated: 2023-03-14 +license: CC BY-SA +tags: + - Image +description: | + In this tutorial, we will implement and discuss variants of modern CNN architectures. + There have been many different architectures been proposed over the past few years. + Some of the most impactful ones, and still relevant today, are the following: [GoogleNet](https://arxiv.org/abs/1409.4842)/Inception architecture (winner of ILSVRC 2014), [ResNet](https://arxiv.org/abs/1512.03385) (winner of ILSVRC 2015), and [DenseNet](https://arxiv.org/abs/1608.06993) (best paper award CVPR 2017). + All of them were state-of-the-art models when being proposed, and the core ideas of these networks are the foundations for most current state-of-the-art architectures. + Thus, it is important to understand these architectures in detail and learn how to implement them. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - torchvision + - matplotlib + - seaborn + - tabulate + - lightning>=2.0.0rc0 +accelerator: + - GPU diff --git a/course_UvA-DL/04-inception-resnet-densenet/.thumb.jpg b/course_UvA-DL/04-inception-resnet-densenet/.thumb.jpg new file mode 100644 index 0000000..a7e0205 Binary files /dev/null and b/course_UvA-DL/04-inception-resnet-densenet/.thumb.jpg differ diff --git a/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py b/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py new file mode 100644 index 0000000..5d5def3 --- /dev/null +++ b/course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py @@ -0,0 +1,1215 @@ +# %% [markdown] +#
+# Let's start with importing our standard libraries here. + +# %% +import os +import urllib.request +from types import SimpleNamespace +from urllib.error import HTTPError + +import lightning as L +import matplotlib +import matplotlib.pyplot as plt +import matplotlib_inline.backend_inline +import numpy as np +import seaborn as sns +import tabulate +import torch +import torch.nn as nn +import torch.optim as optim +import torch.utils.data as data +import torchvision + +# %matplotlib inline +from IPython.display import HTML, display +from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint +from PIL import Image +from torchvision import transforms +from torchvision.datasets import CIFAR10 + +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +matplotlib.rcParams["lines.linewidth"] = 2.0 +sns.reset_orig() + +# PyTorch +# Torchvision + +# %% [markdown] +# We will use the same `set_seed` function as in the previous tutorials, as well as the path variables `DATASET_PATH` and `CHECKPOINT_PATH`. +# Adjust the paths if necessary. + +# %% +# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/ConvNets") + + +# Function for setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + +# %% [markdown] +# We also have pretrained models and Tensorboards (more on this later) for this tutorial, and download them below. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial5/" +# Files to download +pretrained_files = [ + "GoogleNet.ckpt", + "ResNet.ckpt", + "ResNetPreAct.ckpt", + "DenseNet.ckpt", + "tensorboards/GoogleNet/events.out.tfevents.googlenet", + "tensorboards/ResNet/events.out.tfevents.resnet", + "tensorboards/ResNetPreAct/events.out.tfevents.resnetpreact", + "tensorboards/DenseNet/events.out.tfevents.densenet", +] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if "/" in file_name: + os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print(f"Downloading {file_url}...") + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# Throughout this tutorial, we will train and evaluate the models on the CIFAR10 dataset. +# This allows you to compare the results obtained here with the model you have implemented in the first assignment. +# As we have learned from the previous tutorial about initialization, it is important to have the data preprocessed with a zero mean. 
+# Therefore, as a first step, we will calculate the mean and standard deviation of the CIFAR dataset: + +# %% +train_dataset = CIFAR10(root=DATASET_PATH, train=True, download=True) +DATA_MEANS = (train_dataset.data / 255.0).mean(axis=(0, 1, 2)) +DATA_STD = (train_dataset.data / 255.0).std(axis=(0, 1, 2)) +print("Data mean", DATA_MEANS) +print("Data std", DATA_STD) + +# %% [markdown] +# We will use this information to define a `transforms.Normalize` module which will normalize our data accordingly. +# Additionally, we will use data augmentation during training. +# This reduces the risk of overfitting and helps CNNs to generalize better. +# Specifically, we will apply two random augmentations. +# +# First, we will flip each image horizontally by a chance of 50% (`transforms.RandomHorizontalFlip`). +# The object class usually does not change when flipping an image, and we don't expect any image information to be dependent on the horizontal orientation. +# This would be however different if we would try to detect digits or letters in an image, as those have a certain orientation. +# +# The second augmentation we use is called `transforms.RandomResizedCrop`. +# This transformation scales the image in a small range, while eventually changing the aspect ratio, and crops it afterward in the previous size. +# Therefore, the actual pixel values change while the content or overall semantics of the image stays the same. +# +# We will randomly split the training dataset into a training and a validation set. +# The validation set will be used for determining early stopping. +# After finishing the training, we test the models on the CIFAR test set. + +# %% +test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(DATA_MEANS, DATA_STD)]) +# For training, we add some augmentation. Networks are too powerful and would overfit. +train_transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)), + transforms.ToTensor(), + transforms.Normalize(DATA_MEANS, DATA_STD), + ] +) +# Loading the training dataset. We need to split it into a training and validation part +# We need to do a little trick because the validation set should not use the augmentation. +train_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=train_transform, download=True) +val_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=test_transform, download=True) +L.seed_everything(42) +train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000]) +L.seed_everything(42) +_, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000]) + +# Loading the test set +test_set = CIFAR10(root=DATASET_PATH, train=False, transform=test_transform, download=True) + +# We define a set of data loaders that we can use for various purposes later. +train_loader = data.DataLoader(train_set, batch_size=128, shuffle=True, drop_last=True, pin_memory=True, num_workers=4) +val_loader = data.DataLoader(val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) +test_loader = data.DataLoader(test_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) + +# %% [markdown] +# To verify that our normalization works, we can print out the mean and standard deviation of the single batch. 
+# The mean should be close to 0 and the standard deviation close to 1 for each channel: + +# %% +imgs, _ = next(iter(train_loader)) +print("Batch mean", imgs.mean(dim=[0, 2, 3])) +print("Batch std", imgs.std(dim=[0, 2, 3])) + +# %% [markdown] +# Finally, let's visualize a few images from the training set, and how they look like after random data augmentation: + +# %% +NUM_IMAGES = 4 +images = [train_dataset[idx][0] for idx in range(NUM_IMAGES)] +orig_images = [Image.fromarray(train_dataset.data[idx]) for idx in range(NUM_IMAGES)] +orig_images = [test_transform(img) for img in orig_images] + +img_grid = torchvision.utils.make_grid(torch.stack(images + orig_images, dim=0), nrow=4, normalize=True, pad_value=0.5) +img_grid = img_grid.permute(1, 2, 0) + +plt.figure(figsize=(8, 8)) +plt.title("Augmentation examples on CIFAR10") +plt.imshow(img_grid) +plt.axis("off") +plt.show() +plt.close() + +# %% [markdown] +# ## PyTorch Lightning +# +# In this notebook and in many following ones, we will make use of the library [PyTorch Lightning](https://www.lightning.ai/docs/pytorch/stable). +# PyTorch Lightning is a framework that simplifies your code needed to train, evaluate, and test a model in PyTorch. +# It also handles logging into [TensorBoard](https://pytorch.org/tutorials/intermediate/tensorboard_tutorial.html), a visualization toolkit for ML experiments, and saving model checkpoints automatically with minimal code overhead from our side. +# This is extremely helpful for us as we want to focus on implementing different model architectures and spend little time on other code overhead. +# Note that at the time of writing/teaching, the framework has been released in version 1.3. +# Future versions might have a slightly changed interface and thus might not work perfectly with the code (we will try to keep it up-to-date as much as possible). +# +# Now, we will take the first step in PyTorch Lightning, and continue to explore the framework in our other tutorials. +# PyTorch Lightning comes with a lot of useful functions, such as one for setting the seed as we have seen before: + +# %% +# Setting the seed +L.seed_everything(42) + +# %% [markdown] +# Thus, in the future, we don't have to define our own `set_seed` function anymore. +# +# In PyTorch Lightning, we define `L.LightningModule`'s (inheriting from `Module`) that organize our code into 5 main sections: +# +# 1. Initialization (`__init__`), where we create all necessary parameters/models +# 2. Optimizers (`configure_optimizers`) where we create the optimizers, learning rate scheduler, etc. +# 3. +# Training loop (`training_step`) where we only have to define the loss calculation for a single batch (the loop of optimizer.zero_grad(), loss.backward() and optimizer.step(), as well as any logging/saving operation, is done in the background) +# 4. +# Validation loop (`validation_step`) where similarly to the training, we only have to define what should happen per step +# 5. Test loop (`test_step`) which is the same as validation, only on a test set. +# +# Therefore, we don't abstract the PyTorch code, but rather organize it and define some default operations that are commonly used. +# If you need to change something else in your training/validation/test loop, there are many possible functions you can overwrite (see the [docs](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html) for details). 
+# +# Now we can look at an example of how a Lightning Module for training a CNN looks like: + + +# %% +class CIFARModule(L.LightningModule): + def __init__(self, model_name, model_hparams, optimizer_name, optimizer_hparams): + """CIFARModule. + + Args: + model_name: Name of the model/CNN to run. Used for creating the model (see function below) + model_hparams: Hyperparameters for the model, as dictionary. + optimizer_name: Name of the optimizer to use. Currently supported: Adam, SGD + optimizer_hparams: Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc. + """ + super().__init__() + # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace + self.save_hyperparameters() + # Create model + self.model = create_model(model_name, model_hparams) + # Create loss module + self.loss_module = nn.CrossEntropyLoss() + # Example input for visualizing the graph in Tensorboard + self.example_input_array = torch.zeros((1, 3, 32, 32), dtype=torch.float32) + + def forward(self, imgs): + # Forward function that is run when visualizing the graph + return self.model(imgs) + + def configure_optimizers(self): + # We will support Adam or SGD as optimizers. + if self.hparams.optimizer_name == "Adam": + # AdamW is Adam with a correct implementation of weight decay (see here + # for details: https://arxiv.org/pdf/1711.05101.pdf) + optimizer = optim.AdamW(self.parameters(), **self.hparams.optimizer_hparams) + elif self.hparams.optimizer_name == "SGD": + optimizer = optim.SGD(self.parameters(), **self.hparams.optimizer_hparams) + else: + assert False, f'Unknown optimizer: "{self.hparams.optimizer_name}"' + + # We will reduce the learning rate by 0.1 after 100 and 150 epochs + scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1) + return [optimizer], [scheduler] + + def training_step(self, batch, batch_idx): + # "batch" is the output of the training data loader. + imgs, labels = batch + preds = self.model(imgs) + loss = self.loss_module(preds, labels) + acc = (preds.argmax(dim=-1) == labels).float().mean() + + # Logs the accuracy per epoch to tensorboard (weighted average over batches) + self.log("train_acc", acc, on_step=False, on_epoch=True) + self.log("train_loss", loss) + return loss # Return tensor to call ".backward" on + + def validation_step(self, batch, batch_idx): + imgs, labels = batch + preds = self.model(imgs).argmax(dim=-1) + acc = (labels == preds).float().mean() + # By default logs it per epoch (weighted average over batches) + self.log("val_acc", acc) + + def test_step(self, batch, batch_idx): + imgs, labels = batch + preds = self.model(imgs).argmax(dim=-1) + acc = (labels == preds).float().mean() + # By default logs it per epoch (weighted average over batches), and returns it afterwards + self.log("test_acc", acc) + + +# %% [markdown] +# We see that the code is organized and clear, which helps if someone else tries to understand your code. +# +# Another important part of PyTorch Lightning is the concept of callbacks. +# Callbacks are self-contained functions that contain the non-essential logic of your Lightning Module. +# They are usually called after finishing a training epoch, but can also influence other parts of your training loop. +# For instance, we will use the following two pre-defined callbacks: `LearningRateMonitor` and `ModelCheckpoint`. 
+# The learning rate monitor adds the current learning rate to our TensorBoard, which helps to verify that our learning rate scheduler works correctly. +# The model checkpoint callback allows you to customize the saving routine of your checkpoints. +# For instance, how many checkpoints to keep, when to save, which metric to look out for, etc. +# We import them below: + +# %% +# Callbacks + +# %% [markdown] +# To allow running multiple different models with the same Lightning module, we define a function below that maps a model name to the model class. +# At this stage, the dictionary `model_dict` is empty, but we will fill it throughout the notebook with our new models. + +# %% +model_dict = {} + + +def create_model(model_name, model_hparams): + if model_name in model_dict: + return model_dict[model_name](**model_hparams) + else: + assert False, f'Unknown model name "{model_name}". Available models are: {str(model_dict.keys())}' + + +# %% [markdown] +# Similarly, to use the activation function as another hyperparameter in +# our model, we define a "name to function" dict below: + +# %% +act_fn_by_name = {"tanh": nn.Tanh, "relu": nn.ReLU, "leakyrelu": nn.LeakyReLU, "gelu": nn.GELU} + +# %% [markdown] +# If we pass the classes or objects directly as an argument to the Lightning module, we couldn't take advantage of PyTorch Lightning's automatically hyperparameter saving and loading. +# +# Besides the Lightning module, the second most important module in PyTorch Lightning is the `Trainer`. +# The trainer is responsible to execute the training steps defined in the Lightning module and completes the framework. +# Similar to the Lightning module, you can override any key part that you don't want to be automated, but the default settings are often the best practice to do. +# For a full overview, see the [documentation](https://lightning.ai/docs/pytorch/stable/common/trainer.html). +# The most important functions we use below are: +# +# * `trainer.fit`: Takes as input a lightning module, a training dataset, and an (optional) validation dataset. +# This function trains the given module on the training dataset with occasional validation (default once per epoch, can be changed) +# * `trainer.test`: Takes as input a model and a dataset on which we want to test. +# It returns the test metric on the dataset. +# +# For training and testing, we don't have to worry about things like setting the model to eval mode (`model.eval()`) as this is all done automatically. +# See below how we define a training function for our models: + + +# %% +def train_model(model_name, save_name=None, **kwargs): + """Train model. + + Args: + model_name: Name of the model you want to run. Is used to look up the class in "model_dict" + save_name (optional): If specified, this name will be used for creating the checkpoint and logging directory. + """ + if save_name is None: + save_name = model_name + + # Create a PyTorch Lightning trainer with the generation callback + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, save_name), # Where to save models + # We run on a single GPU (if possible) + accelerator="auto", + devices=1, + # How many epochs to train for if no patience is set + max_epochs=180, + callbacks=[ + ModelCheckpoint( + save_weights_only=True, mode="max", monitor="val_acc" + ), # Save the best checkpoint based on the maximum val_acc recorded. 
Saves only weights and not optimizer + LearningRateMonitor("epoch"), + ], # Log learning rate every epoch + ) # In case your notebook crashes due to the progress bar, consider increasing the refresh rate + trainer.logger._log_graph = True # If True, we plot the computation graph in tensorboard + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, save_name + ".ckpt") + if os.path.isfile(pretrained_filename): + print(f"Found pretrained model at {pretrained_filename}, loading...") + # Automatically loads the model with the saved hyperparameters + model = CIFARModule.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything(42) # To be reproducable + model = CIFARModule(model_name=model_name, **kwargs) + trainer.fit(model, train_loader, val_loader) + model = CIFARModule.load_from_checkpoint( + trainer.checkpoint_callback.best_model_path + ) # Load best checkpoint after training + + # Test best model on validation and test set + val_result = trainer.test(model, dataloaders=val_loader, verbose=False) + test_result = trainer.test(model, dataloaders=test_loader, verbose=False) + result = {"test": test_result[0]["test_acc"], "val": val_result[0]["test_acc"]} + + return model, result + + +# %% [markdown] +# Finally, we can focus on the Convolutional Neural Networks we want to +# implement today: GoogleNet, ResNet, and DenseNet. + +# %% [markdown] +# ## Inception +# +#
+# +# The [GoogleNet](https://arxiv.org/abs/1409.4842), proposed in 2014, won the ImageNet Challenge because of its usage of the Inception modules. +# In general, we will mainly focus on the concept of Inception in this tutorial instead of the specifics of the GoogleNet, as based on Inception, there have been many follow-up works ([Inception-v2](https://arxiv.org/abs/1512.00567), [Inception-v3](https://arxiv.org/abs/1512.00567), [Inception-v4](https://arxiv.org/abs/1602.07261), [Inception-ResNet](https://arxiv.org/abs/1602.07261),...). +# The follow-up works mainly focus on increasing efficiency and enabling very deep Inception networks. +# However, for a fundamental understanding, it is sufficient to look at the original Inception block. +# +# An Inception block applies four convolution blocks separately on the same feature map: a 1x1, 3x3, and 5x5 convolution, and a max pool operation. +# This allows the network to look at the same data with different receptive fields. +# Of course, learning only 5x5 convolution would be theoretically more powerful. +# However, this is not only more computation and memory heavy but also tends to overfit much easier. +# The overall inception block looks like below (figure credit - [Szegedy et al. ](https://arxiv.org/abs/1409.4842)): +# +#
+# +# The additional 1x1 convolutions before the 3x3 and 5x5 convolutions are used for dimensionality reduction. +# This is especially crucial as the feature maps of all branches are merged afterward, and we don't want any explosion of feature size. +# As 5x5 convolutions are 25 times more expensive than 1x1 convolutions, we can save a lot of computation and parameters by reducing the dimensionality before the large convolutions. +# +# We can now try to implement the Inception Block ourselves: + + +# %% +class InceptionBlock(nn.Module): + def __init__(self, c_in, c_red: dict, c_out: dict, act_fn): + """InceptionBlock. + + Args: + c_in: Number of input feature maps from the previous layers + c_red: Dictionary with keys "3x3" and "5x5" specifying the output of the dimensionality reducing 1x1 convolutions + c_out: Dictionary with keys "1x1", "3x3", "5x5", and "max" + act_fn: Activation class constructor (e.g. nn.ReLU) + """ + super().__init__() + + # 1x1 convolution branch + self.conv_1x1 = nn.Sequential( + nn.Conv2d(c_in, c_out["1x1"], kernel_size=1), nn.BatchNorm2d(c_out["1x1"]), act_fn() + ) + + # 3x3 convolution branch + self.conv_3x3 = nn.Sequential( + nn.Conv2d(c_in, c_red["3x3"], kernel_size=1), + nn.BatchNorm2d(c_red["3x3"]), + act_fn(), + nn.Conv2d(c_red["3x3"], c_out["3x3"], kernel_size=3, padding=1), + nn.BatchNorm2d(c_out["3x3"]), + act_fn(), + ) + + # 5x5 convolution branch + self.conv_5x5 = nn.Sequential( + nn.Conv2d(c_in, c_red["5x5"], kernel_size=1), + nn.BatchNorm2d(c_red["5x5"]), + act_fn(), + nn.Conv2d(c_red["5x5"], c_out["5x5"], kernel_size=5, padding=2), + nn.BatchNorm2d(c_out["5x5"]), + act_fn(), + ) + + # Max-pool branch + self.max_pool = nn.Sequential( + nn.MaxPool2d(kernel_size=3, padding=1, stride=1), + nn.Conv2d(c_in, c_out["max"], kernel_size=1), + nn.BatchNorm2d(c_out["max"]), + act_fn(), + ) + + def forward(self, x): + x_1x1 = self.conv_1x1(x) + x_3x3 = self.conv_3x3(x) + x_5x5 = self.conv_5x5(x) + x_max = self.max_pool(x) + x_out = torch.cat([x_1x1, x_3x3, x_5x5, x_max], dim=1) + return x_out + + +# %% [markdown] +# The GoogleNet architecture consists of stacking multiple Inception blocks with occasional max pooling to reduce the height and width of the feature maps. +# The original GoogleNet was designed for image sizes of ImageNet (224x224 pixels) and had almost 7 million parameters. +# As we train on CIFAR10 with image sizes of 32x32, we don't require such a heavy architecture, and instead, apply a reduced version. +# The number of channels for dimensionality reduction and output per filter (1x1, 3x3, 5x5, and max pooling) need to be manually specified and can be changed if interested. +# The general intuition is to have the most filters for the 3x3 +# convolutions, as they are powerful enough to take the context into +# account while requiring almost a third of the parameters of the 5x5 +# convolution. 
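# %% [markdown]
# Before assembling the full network, it can be helpful to sanity-check a single Inception block on a dummy input.
# The small sketch below uses purely illustrative channel numbers (not the exact GoogleNet configuration):
# the output channel count is simply the sum of the four branch outputs, and the spatial size stays unchanged.

# %%
# Sanity check of the InceptionBlock with illustrative channel numbers: 16 + 32 + 8 + 8 = 64 output channels
_demo_block = InceptionBlock(
    c_in=64,
    c_red={"3x3": 32, "5x5": 16},
    c_out={"1x1": 16, "3x3": 32, "5x5": 8, "max": 8},
    act_fn=nn.ReLU,
)
with torch.no_grad():
    _demo_out = _demo_block(torch.randn(2, 64, 32, 32))
print("Inception block output shape:", _demo_out.shape)  # Expected: torch.Size([2, 64, 32, 32])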
+ + +# %% +class GoogleNet(nn.Module): + def __init__(self, num_classes=10, act_fn_name="relu", **kwargs): + super().__init__() + self.hparams = SimpleNamespace( + num_classes=num_classes, act_fn_name=act_fn_name, act_fn=act_fn_by_name[act_fn_name] + ) + self._create_network() + self._init_params() + + def _create_network(self): + # A first convolution on the original image to scale up the channel size + self.input_net = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, padding=1), nn.BatchNorm2d(64), self.hparams.act_fn() + ) + # Stacking inception blocks + self.inception_blocks = nn.Sequential( + InceptionBlock( + 64, + c_red={"3x3": 32, "5x5": 16}, + c_out={"1x1": 16, "3x3": 32, "5x5": 8, "max": 8}, + act_fn=self.hparams.act_fn, + ), + InceptionBlock( + 64, + c_red={"3x3": 32, "5x5": 16}, + c_out={"1x1": 24, "3x3": 48, "5x5": 12, "max": 12}, + act_fn=self.hparams.act_fn, + ), + nn.MaxPool2d(3, stride=2, padding=1), # 32x32 => 16x16 + InceptionBlock( + 96, + c_red={"3x3": 32, "5x5": 16}, + c_out={"1x1": 24, "3x3": 48, "5x5": 12, "max": 12}, + act_fn=self.hparams.act_fn, + ), + InceptionBlock( + 96, + c_red={"3x3": 32, "5x5": 16}, + c_out={"1x1": 16, "3x3": 48, "5x5": 16, "max": 16}, + act_fn=self.hparams.act_fn, + ), + InceptionBlock( + 96, + c_red={"3x3": 32, "5x5": 16}, + c_out={"1x1": 16, "3x3": 48, "5x5": 16, "max": 16}, + act_fn=self.hparams.act_fn, + ), + InceptionBlock( + 96, + c_red={"3x3": 32, "5x5": 16}, + c_out={"1x1": 32, "3x3": 48, "5x5": 24, "max": 24}, + act_fn=self.hparams.act_fn, + ), + nn.MaxPool2d(3, stride=2, padding=1), # 16x16 => 8x8 + InceptionBlock( + 128, + c_red={"3x3": 48, "5x5": 16}, + c_out={"1x1": 32, "3x3": 64, "5x5": 16, "max": 16}, + act_fn=self.hparams.act_fn, + ), + InceptionBlock( + 128, + c_red={"3x3": 48, "5x5": 16}, + c_out={"1x1": 32, "3x3": 64, "5x5": 16, "max": 16}, + act_fn=self.hparams.act_fn, + ), + ) + # Mapping to classification output + self.output_net = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(), nn.Linear(128, self.hparams.num_classes) + ) + + def _init_params(self): + # Based on our discussion in Tutorial 4, we should initialize the + # convolutions according to the activation function + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, nonlinearity=self.hparams.act_fn_name) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.input_net(x) + x = self.inception_blocks(x) + x = self.output_net(x) + return x + + +# %% [markdown] +# Now, we can integrate our model to the model dictionary we defined above: + +# %% +model_dict["GoogleNet"] = GoogleNet + +# %% [markdown] +# The training of the model is handled by PyTorch Lightning, and we just have to define the command to start. +# Note that we train for almost 200 epochs, which takes about an hour on Lisa's default GPUs (GTX1080Ti). +# We would recommend using the saved models and train your own model if you are interested. + +# %% +googlenet_model, googlenet_results = train_model( + model_name="GoogleNet", + model_hparams={"num_classes": 10, "act_fn_name": "relu"}, + optimizer_name="Adam", + optimizer_hparams={"lr": 1e-3, "weight_decay": 1e-4}, +) + +# %% [markdown] +# We will compare the results later in the notebooks, but we can already print them here for a first glance: + +# %% +print("GoogleNet Results", googlenet_results) + +# %% [markdown] +# ### Tensorboard log +# +# A nice extra of PyTorch Lightning is the automatic logging into TensorBoard. 
# To give you a better intuition of what TensorBoard can be used for, we can look at the board that PyTorch Lightning generated when training the GoogleNet.
# TensorBoard provides inline functionality for Jupyter notebooks, and we use it here:

# %%
# Import tensorboard
# %load_ext tensorboard

# %%
# Opens tensorboard in notebook. Adjust the path to your CHECKPOINT_PATH!
# %tensorboard --logdir ../saved_models/tutorial5/tensorboards/GoogleNet/

# %% [markdown]
#
#
# TensorBoard is organized in multiple tabs.
# The main tab is the scalar tab, where we can log the development of individual scalar values.
# For example, we have plotted the training loss, accuracy, learning rate, etc.
# If we look at the training or validation accuracy, we can clearly see the impact of using a learning rate scheduler.
# Reducing the learning rate gives our model a nice boost in training performance.
# Similarly, when looking at the training loss, we see a sudden decrease at this point.
# However, the much higher accuracy on the training set compared to validation indicates that our model was overfitting, which is almost inevitable for such large networks.
#
# Another interesting tab in TensorBoard is the graph tab.
# It shows us the network architecture organized by building blocks from the input to the output.
# It essentially shows the operations taken in the forward step of `CIFARModule`.
# Double-click on a module to open it.
# Feel free to explore the architecture from a different perspective.
# The graph visualization can often help you validate that your model
# is actually doing what it is supposed to do, and that you haven't missed any
# layers in the computation graph.

# %% [markdown]
# ## ResNet
#
# The [ResNet](https://arxiv.org/abs/1512.03385) paper is one of the [most cited AI papers](https://www.natureindex.com/news-blog/google-scholar-reveals-most-influential-papers-research-citations-twenty-twenty), and has been the foundation for neural networks with more than 1,000 layers.
# Despite its simplicity, the idea of residual connections is highly effective as it supports stable gradient propagation through the network.
# Instead of modeling $x_{l+1}=F(x_{l})$, we model $x_{l+1}=x_{l}+F(x_{l})$, where $F$ is a non-linear mapping (usually a sequence of NN modules like convolutions, activation functions, and normalizations).
# If we do backpropagation on such residual connections, we obtain:
#
# $$\frac{\partial x_{l+1}}{\partial x_{l}} = \mathbf{I} + \frac{\partial F(x_{l})}{\partial x_{l}}$$
#
# The bias towards the identity matrix guarantees stable gradient propagation that is less affected by $F$ itself.
# There have been many variants of ResNet proposed, which mostly concern the function $F$, or the operations applied on the sum.
# In this tutorial, we look at two of them: the original ResNet block, and the [Pre-Activation ResNet block](https://arxiv.org/abs/1603.05027).
# We visually compare the blocks below (figure credit - [He et al. ](https://arxiv.org/abs/1603.05027)):
#
#
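# %% [markdown]
# As a small side note on why this helps across many layers: unrolling the recursion over several blocks, the gradient of a deep feature $x_L$ with respect to an earlier one $x_l$ becomes
#
# $$\frac{\partial x_{L}}{\partial x_{l}} = \prod_{i=l}^{L-1}\left(\mathbf{I} + \frac{\partial F(x_{i})}{\partial x_{i}}\right)$$
#
# Expanding this product always yields an identity term, so the gradient signal from layer $L$ can reach layer $l$ without being forced through every non-linear mapping $F$ on the way.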
+# +# The original ResNet block applies a non-linear activation function, usually ReLU, after the skip connection. +# In contrast, the pre-activation ResNet block applies the non-linearity at the beginning of $F$. +# Both have their advantages and disadvantages. +# For very deep network, however, the pre-activation ResNet has shown to perform better as the gradient flow is guaranteed to have the identity matrix as calculated above, and is not harmed by any non-linear activation applied to it. +# For comparison, in this notebook, we implement both ResNet types as shallow networks. +# +# Let's start with the original ResNet block. +# The visualization above already shows what layers are included in $F$. +# One special case we have to handle is when we want to reduce the image dimensions in terms of width and height. +# The basic ResNet block requires $F(x_{l})$ to be of the same shape as $x_{l}$. +# Thus, we need to change the dimensionality of $x_{l}$ as well before adding to $F(x_{l})$. +# The original implementation used an identity mapping with stride 2 and padded additional feature dimensions with 0. +# However, the more common implementation is to use a 1x1 convolution with stride 2 as it allows us to change the feature dimensionality while being efficient in parameter and computation cost. +# The code for the ResNet block is relatively simple, and shown below: + +# %% + + +class ResNetBlock(nn.Module): + def __init__(self, c_in, act_fn, subsample=False, c_out=-1): + """ResNetBlock. + + Args: + c_in: Number of input features + act_fn: Activation class constructor (e.g. nn.ReLU) + subsample - If True, we want to apply a stride inside the block and reduce the output shape by 2 in height and width + c_out - Number of output features. Note that this is only relevant if subsample is True, as otherwise, c_out = c_in + """ + super().__init__() + if not subsample: + c_out = c_in + + # Network representing F + self.net = nn.Sequential( + nn.Conv2d( + c_in, c_out, kernel_size=3, padding=1, stride=1 if not subsample else 2, bias=False + ), # No bias needed as the Batch Norm handles it + nn.BatchNorm2d(c_out), + act_fn(), + nn.Conv2d(c_out, c_out, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(c_out), + ) + + # 1x1 convolution with stride 2 means we take the upper left value, and transform it to new output size + self.downsample = nn.Conv2d(c_in, c_out, kernel_size=1, stride=2) if subsample else None + self.act_fn = act_fn() + + def forward(self, x): + z = self.net(x) + if self.downsample is not None: + x = self.downsample(x) + out = z + x + out = self.act_fn(out) + return out + + +# %% [markdown] +# The second block we implement is the pre-activation ResNet block. +# For this, we have to change the order of layer in `self.net`, and do not apply an activation function on the output. +# Additionally, the downsampling operation has to apply a non-linearity as well as the input, $x_l$, has not been processed by a non-linearity yet. +# Hence, the block looks as follows: + + +# %% +class PreActResNetBlock(nn.Module): + def __init__(self, c_in, act_fn, subsample=False, c_out=-1): + """PreAct ResNet Block. + + Args: + c_in - Number of input features + act_fn - Activation class constructor (e.g. nn.ReLU) + subsample - If True, we want to apply a stride inside the block and reduce the output shape by 2 in height and width + c_out - Number of output features. 
Note that this is only relevant if subsample is True, as otherwise, c_out = c_in + """ + super().__init__() + if not subsample: + c_out = c_in + + # Network representing F + self.net = nn.Sequential( + nn.BatchNorm2d(c_in), + act_fn(), + nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, stride=1 if not subsample else 2, bias=False), + nn.BatchNorm2d(c_out), + act_fn(), + nn.Conv2d(c_out, c_out, kernel_size=3, padding=1, bias=False), + ) + + # 1x1 convolution needs to apply non-linearity as well as not done on skip connection + self.downsample = ( + nn.Sequential(nn.BatchNorm2d(c_in), act_fn(), nn.Conv2d(c_in, c_out, kernel_size=1, stride=2, bias=False)) + if subsample + else None + ) + + def forward(self, x): + z = self.net(x) + if self.downsample is not None: + x = self.downsample(x) + out = z + x + return out + + +# %% [markdown] +# Similarly to the model selection, we define a dictionary to create a mapping from string to block class. +# We will use the string name as hyperparameter value in our model to choose between the ResNet blocks. +# Feel free to implement any other ResNet block type and add it here as well. + +# %% +resnet_blocks_by_name = {"ResNetBlock": ResNetBlock, "PreActResNetBlock": PreActResNetBlock} + +# %% [markdown] +# The overall ResNet architecture consists of stacking multiple ResNet blocks, of which some are downsampling the input. +# When talking about ResNet blocks in the whole network, we usually group them by the same output shape. +# Hence, if we say the ResNet has `[3,3,3]` blocks, it means that we have 3 times a group of 3 ResNet blocks, where a subsampling is taking place in the fourth and seventh block. +# The ResNet with `[3,3,3]` blocks on CIFAR10 is visualized below. +# +#
+# +# The three groups operate on the resolutions $32\times32$, $16\times16$ and $8\times8$ respectively. +# The blocks in orange denote ResNet blocks with downsampling. +# The same notation is used by many other implementations such as in the [torchvision library](https://pytorch.org/vision/0.11/models.html#torchvision.models.resnet18) from PyTorch. +# Thus, our code looks as follows: + + +# %% +class ResNet(nn.Module): + def __init__( + self, + num_classes=10, + num_blocks=[3, 3, 3], + c_hidden=[16, 32, 64], + act_fn_name="relu", + block_name="ResNetBlock", + **kwargs, + ): + """ResNet. + + Args: + num_classes - Number of classification outputs (10 for CIFAR10) + num_blocks - List with the number of ResNet blocks to use. The first block of each group uses downsampling, except the first. + c_hidden - List with the hidden dimensionalities in the different blocks. Usually multiplied by 2 the deeper we go. + act_fn_name - Name of the activation function to use, looked up in "act_fn_by_name" + block_name - Name of the ResNet block, looked up in "resnet_blocks_by_name" + """ + super().__init__() + assert block_name in resnet_blocks_by_name + self.hparams = SimpleNamespace( + num_classes=num_classes, + c_hidden=c_hidden, + num_blocks=num_blocks, + act_fn_name=act_fn_name, + act_fn=act_fn_by_name[act_fn_name], + block_class=resnet_blocks_by_name[block_name], + ) + self._create_network() + self._init_params() + + def _create_network(self): + c_hidden = self.hparams.c_hidden + + # A first convolution on the original image to scale up the channel size + if self.hparams.block_class == PreActResNetBlock: # => Don't apply non-linearity on output + self.input_net = nn.Sequential(nn.Conv2d(3, c_hidden[0], kernel_size=3, padding=1, bias=False)) + else: + self.input_net = nn.Sequential( + nn.Conv2d(3, c_hidden[0], kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(c_hidden[0]), + self.hparams.act_fn(), + ) + + # Creating the ResNet blocks + blocks = [] + for block_idx, block_count in enumerate(self.hparams.num_blocks): + for bc in range(block_count): + # Subsample the first block of each group, except the very first one. + subsample = bc == 0 and block_idx > 0 + blocks.append( + self.hparams.block_class( + c_in=c_hidden[block_idx if not subsample else (block_idx - 1)], + act_fn=self.hparams.act_fn, + subsample=subsample, + c_out=c_hidden[block_idx], + ) + ) + self.blocks = nn.Sequential(*blocks) + + # Mapping to classification output + self.output_net = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(), nn.Linear(c_hidden[-1], self.hparams.num_classes) + ) + + def _init_params(self): + # Based on our discussion in Tutorial 4, we should initialize the convolutions according to the activation function + # Fan-out focuses on the gradient distribution, and is commonly used in ResNets + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity=self.hparams.act_fn_name) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.input_net(x) + x = self.blocks(x) + x = self.output_net(x) + return x + + +# %% [markdown] +# We also need to add the new ResNet class to our model dictionary: + +# %% +model_dict["ResNet"] = ResNet + +# %% [markdown] +# Finally, we can train our ResNet models. +# One difference to the GoogleNet training is that we explicitly use SGD with Momentum as optimizer instead of Adam. 
+# Adam often leads to a slightly worse accuracy on plain, shallow ResNets. +# It is not 100% clear why Adam performs worse in this context, but one possible explanation is related to ResNet's loss surface. +# ResNet has been shown to produce smoother loss surfaces than networks without skip connection (see [Li et al., 2018](https://arxiv.org/pdf/1712.09913.pdf) for details). +# A possible visualization of the loss surface with/out skip connections is below (figure credit - [Li et al. ](https://arxiv.org/pdf/1712.09913.pdf)): +# +#
+# +# The $x$ and $y$ axis shows a projection of the parameter space, and the $z$ axis shows the loss values achieved by different parameter values. +# On smooth surfaces like the one on the right, we might not require an adaptive learning rate as Adam provides. +# Instead, Adam can get stuck in local optima while SGD finds the wider minima that tend to generalize better. +# However, to answer this question in detail, we would need an extra tutorial because it is not easy to answer. +# For now, we conclude: for ResNet architectures, consider the optimizer to be an important hyperparameter, and try training with both Adam and SGD. +# Let's train the model below with SGD: + +# %% +resnet_model, resnet_results = train_model( + model_name="ResNet", + model_hparams={"num_classes": 10, "c_hidden": [16, 32, 64], "num_blocks": [3, 3, 3], "act_fn_name": "relu"}, + optimizer_name="SGD", + optimizer_hparams={"lr": 0.1, "momentum": 0.9, "weight_decay": 1e-4}, +) + +# %% [markdown] +# Let's also train the pre-activation ResNet as comparison: + +# %% +resnetpreact_model, resnetpreact_results = train_model( + model_name="ResNet", + model_hparams={ + "num_classes": 10, + "c_hidden": [16, 32, 64], + "num_blocks": [3, 3, 3], + "act_fn_name": "relu", + "block_name": "PreActResNetBlock", + }, + optimizer_name="SGD", + optimizer_hparams={"lr": 0.1, "momentum": 0.9, "weight_decay": 1e-4}, + save_name="ResNetPreAct", +) + +# %% [markdown] +# ### Tensorboard log +# +# Similarly to our GoogleNet model, we also have a TensorBoard log for the ResNet model. We can open it below. + +# %% +# Opens tensorboard in notebook. Adjust the path to your CHECKPOINT_PATH! Feel free to change "ResNet" to "ResNetPreAct" +# %tensorboard --logdir ../saved_models/tutorial5/tensorboards/ResNet/ + +# %% [markdown] +#
+# +# Feel free to explore the TensorBoard yourself, including the computation graph. +# In general, we can see that with SGD, the ResNet has a higher training loss than the GoogleNet in the first stage of the training. +# After reducing the learning rate however, the model achieves even higher validation accuracies. +# We compare the precise scores at the end of the notebook. + +# %% [markdown] +# ## DenseNet +# +#
#
# [DenseNet](https://arxiv.org/abs/1608.06993) is another architecture for enabling very deep neural networks and takes a slightly different perspective on residual connections.
# Instead of modeling the difference between layers, DenseNet considers residual connections as a possible way to reuse features across layers, removing any necessity to learn redundant feature maps.
# If we go deeper into the network, the model learns abstract features to recognize patterns.
# However, some complex patterns consist of a combination of abstract features (e.g. hand, face, etc.) and low-level features (e.g. edges, basic color, etc.).
# To find these low-level features in the deep layers, standard CNNs have to learn to copy such feature maps, which wastes a lot of parameter complexity.
# DenseNet provides an efficient way of reusing features by having each convolution depend on all previous input feature maps, while adding only a small number of new feature maps.
# See the figure below for an illustration (figure credit - [Huang et al. ](https://arxiv.org/abs/1608.06993)):
#
#
+# +# The last layer, called the transition layer, is responsible for reducing the dimensionality of the feature maps in height, width, and channel size. +# Although those technically break the identity backpropagation, there are only a few in a network so that it doesn't affect the gradient flow much. +# +# We split the implementation of the layers in DenseNet into three parts: a `DenseLayer`, and a `DenseBlock`, and a `TransitionLayer`. +# The module `DenseLayer` implements a single layer inside a dense block. +# It applies a 1x1 convolution for dimensionality reduction with a subsequential 3x3 convolution. +# The output channels are concatenated to the originals and returned. +# Note that we apply the Batch Normalization as the first layer of each block. +# This allows slightly different activations for the same features to different layers, depending on what is needed. +# Overall, we can implement it as follows: + + +# %% +class DenseLayer(nn.Module): + def __init__(self, c_in, bn_size, growth_rate, act_fn): + """DenseLayer. + + Args: + c_in - Number of input channels + bn_size - Bottleneck size (factor of growth rate) for the output of the 1x1 convolution. Typically between 2 and 4. + growth_rate - Number of output channels of the 3x3 convolution + act_fn - Activation class constructor (e.g. nn.ReLU) + """ + super().__init__() + self.net = nn.Sequential( + nn.BatchNorm2d(c_in), + act_fn(), + nn.Conv2d(c_in, bn_size * growth_rate, kernel_size=1, bias=False), + nn.BatchNorm2d(bn_size * growth_rate), + act_fn(), + nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False), + ) + + def forward(self, x): + out = self.net(x) + out = torch.cat([out, x], dim=1) + return out + + +# %% [markdown] +# The module `DenseBlock` summarizes multiple dense layers applied in sequence. +# Each dense layer takes as input the original input concatenated with all previous layers' feature maps: + + +# %% +class DenseBlock(nn.Module): + def __init__(self, c_in, num_layers, bn_size, growth_rate, act_fn): + """Dense Block. + + Args: + c_in - Number of input channels + num_layers - Number of dense layers to apply in the block + bn_size - Bottleneck size to use in the dense layers + growth_rate - Growth rate to use in the dense layers + act_fn - Activation function to use in the dense layers + """ + super().__init__() + layers = [] + for layer_idx in range(num_layers): + # Input channels are original plus the feature maps from previous layers + layer_c_in = c_in + layer_idx * growth_rate + layers.append(DenseLayer(c_in=layer_c_in, bn_size=bn_size, growth_rate=growth_rate, act_fn=act_fn)) + self.block = nn.Sequential(*layers) + + def forward(self, x): + out = self.block(x) + return out + + +# %% [markdown] +# Finally, the `TransitionLayer` takes as input the final output of a dense block and reduces its channel dimensionality using a 1x1 convolution. +# To reduce the height and width dimension, we take a slightly different approach than in ResNet and apply an average pooling with kernel size 2 and stride 2. +# This is because we don't have an additional connection to the output that would consider the full 2x2 patch instead of a single value. +# Besides, it is more parameter efficient than using a 3x3 convolution with stride 2. 
+# Thus, the layer is implemented as follows: + + +# %% +class TransitionLayer(nn.Module): + def __init__(self, c_in, c_out, act_fn): + super().__init__() + self.transition = nn.Sequential( + nn.BatchNorm2d(c_in), + act_fn(), + nn.Conv2d(c_in, c_out, kernel_size=1, bias=False), + nn.AvgPool2d(kernel_size=2, stride=2), # Average the output for each 2x2 pixel group + ) + + def forward(self, x): + return self.transition(x) + + +# %% [markdown] +# Now we can put everything together and create our DenseNet. +# To specify the number of layers, we use a similar notation as in ResNets and pass on a list of ints representing the number of layers per block. +# After each dense block except the last one, we apply a transition layer to reduce the dimensionality by 2. + + +# %% +class DenseNet(nn.Module): + def __init__( + self, num_classes=10, num_layers=[6, 6, 6, 6], bn_size=2, growth_rate=16, act_fn_name="relu", **kwargs + ): + super().__init__() + self.hparams = SimpleNamespace( + num_classes=num_classes, + num_layers=num_layers, + bn_size=bn_size, + growth_rate=growth_rate, + act_fn_name=act_fn_name, + act_fn=act_fn_by_name[act_fn_name], + ) + self._create_network() + self._init_params() + + def _create_network(self): + c_hidden = self.hparams.growth_rate * self.hparams.bn_size # The start number of hidden channels + + # A first convolution on the original image to scale up the channel size + self.input_net = nn.Sequential( + # No batch norm or activation function as done inside the Dense layers + nn.Conv2d(3, c_hidden, kernel_size=3, padding=1) + ) + + # Creating the dense blocks, eventually including transition layers + blocks = [] + for block_idx, num_layers in enumerate(self.hparams.num_layers): + blocks.append( + DenseBlock( + c_in=c_hidden, + num_layers=num_layers, + bn_size=self.hparams.bn_size, + growth_rate=self.hparams.growth_rate, + act_fn=self.hparams.act_fn, + ) + ) + c_hidden = c_hidden + num_layers * self.hparams.growth_rate # Overall output of the dense block + if block_idx < len(self.hparams.num_layers) - 1: # Don't apply transition layer on last block + blocks.append(TransitionLayer(c_in=c_hidden, c_out=c_hidden // 2, act_fn=self.hparams.act_fn)) + c_hidden = c_hidden // 2 + + self.blocks = nn.Sequential(*blocks) + + # Mapping to classification output + self.output_net = nn.Sequential( + nn.BatchNorm2d(c_hidden), # The features have not passed a non-linearity until here. + self.hparams.act_fn(), + nn.AdaptiveAvgPool2d((1, 1)), + nn.Flatten(), + nn.Linear(c_hidden, self.hparams.num_classes), + ) + + def _init_params(self): + # Based on our discussion in Tutorial 4, we should initialize the + # convolutions according to the activation function + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, nonlinearity=self.hparams.act_fn_name) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.input_net(x) + x = self.blocks(x) + x = self.output_net(x) + return x + + +# %% [markdown] +# Let's also add the DenseNet to our model dictionary: + +# %% +model_dict["DenseNet"] = DenseNet + +# %% [markdown] +# Lastly, we train our network. +# In contrast to ResNet, DenseNet does not show any issues with Adam, and hence we train it with this optimizer. +# The other hyperparameters are chosen to result in a network with a similar parameter size as the ResNet and GoogleNet. 
+# Commonly, when designing very deep networks, DenseNet is more parameter +# efficient than ResNet while achieving a similar or even better +# performance. + +# %% +densenet_model, densenet_results = train_model( + model_name="DenseNet", + model_hparams={ + "num_classes": 10, + "num_layers": [6, 6, 6, 6], + "bn_size": 2, + "growth_rate": 16, + "act_fn_name": "relu", + }, + optimizer_name="Adam", + optimizer_hparams={"lr": 1e-3, "weight_decay": 1e-4}, +) + +# %% [markdown] +# ### Tensorboard log +# +# Finally, we also have another TensorBoard for the DenseNet training. We take a look at it below: + +# %% +# Opens tensorboard in notebook. Adjust the path to your CHECKPOINT_PATH! Feel free to change "ResNet" to "ResNetPreAct" +# %tensorboard --logdir ../saved_models/tutorial5/tensorboards/DenseNet/ + +# %% [markdown] +#
#
# The overall course of the validation accuracy and training loss resembles the training of GoogleNet, which is also a result of training the network with Adam.
# Feel free to explore the training metrics yourself.

# %% [markdown]
# ## Conclusion and Comparison
#
# After discussing each model separately, and training all of them, we can finally compare them.
# First, let's organize the results of all models in a table:

# %% language="html"
#
#

# %%
all_models = [
    ("GoogleNet", googlenet_results, googlenet_model),
    ("ResNet", resnet_results, resnet_model),
    ("ResNetPreAct", resnetpreact_results, resnetpreact_model),
    ("DenseNet", densenet_results, densenet_model),
]
table = [
    [
        model_name,
        f"{100.0*model_results['val']:4.2f}%",
        f"{100.0*model_results['test']:4.2f}%",
        f"{sum(np.prod(p.shape) for p in model.parameters()):,}",
    ]
    for model_name, model_results, model in all_models
]
display(
    HTML(
        tabulate.tabulate(table, tablefmt="html", headers=["Model", "Val Accuracy", "Test Accuracy", "Num Parameters"])
    )
)

# %% [markdown]
# First of all, we see that all models are performing reasonably well.
# Simpler models, as you have implemented them in the practical, achieve considerably lower performance; besides their lower number of parameters, this is also attributed to the architecture design choices.
# GoogleNet obtains the lowest performance on the validation and test set, although it is very close to DenseNet.
# A proper hyperparameter search over all the channel sizes in GoogleNet would likely improve the accuracy of the model to a similar level, but this is also expensive given the large number of hyperparameters.
# ResNet outperforms both DenseNet and GoogleNet by more than 1% on the validation set, while there is only a minor difference between the two versions, original and pre-activation.
# We can conclude that for shallow networks, the placement of the activation function does not seem to be crucial, although papers have reported the contrary for very deep networks (e.g. [He et al. ](https://arxiv.org/abs/1603.05027)).
#
# In general, we can conclude that ResNet is a simple but powerful architecture.
# If we applied the models to more complex tasks with larger images and more layers inside the networks, we would likely see a bigger gap between GoogleNet and skip-connection architectures like ResNet and DenseNet.
# A comparison with deeper models on CIFAR10 can, for example, be found [here](https://github.com/kuangliu/pytorch-cifar).
# Interestingly, DenseNet outperforms the original ResNet on their setup, but comes in closely behind the Pre-Activation ResNet.
# The best model, a Dual Path Network ([Chen et al.](https://arxiv.org/abs/1707.01629)), is actually a combination of ResNet and DenseNet, showing that both offer different advantages.

# %% [markdown]
# ### Which model should I choose for my task?
#
# We have reviewed four different models.
# So, which one should we choose when given a new task?
# Usually, starting with a ResNet is a good idea, given its superior performance on the CIFAR dataset and its simple implementation.
# Besides, for the parameter budget we have chosen here, ResNet is also the fastest, as DenseNet and GoogleNet have many more layers that are applied in sequence in our simple implementation.
# However, if you have a really difficult task, such as semantic segmentation on HD images, more complex variants of ResNet and DenseNet are recommended.
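# %% [markdown]
# As a final sanity check, the trained Lightning modules can be used for inference directly.
# The minimal sketch below (using the ResNet trained above) shows the usual pattern:
# switch to evaluation mode, disable gradient tracking, and run a normalized batch from the test loader through the model.

# %%
# Minimal inference sketch with the trained ResNet
resnet_model.eval()
with torch.no_grad():
    imgs, labels = next(iter(test_loader))
    preds = resnet_model(imgs.to(resnet_model.device)).argmax(dim=-1).cpu()
print("Predicted classes:", preds[:8].tolist())
print("True classes:     ", labels[:8].tolist())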
diff --git a/course_UvA-DL/04-inception-resnet-densenet/densenet_block.svg b/course_UvA-DL/04-inception-resnet-densenet/densenet_block.svg new file mode 100644 index 0000000..7b7c5a5 --- /dev/null +++ b/course_UvA-DL/04-inception-resnet-densenet/densenet_block.svg @@ -0,0 +1 @@ + diff --git a/course_UvA-DL/04-inception-resnet-densenet/inception_block.svg b/course_UvA-DL/04-inception-resnet-densenet/inception_block.svg new file mode 100644 index 0000000..be62455 --- /dev/null +++ b/course_UvA-DL/04-inception-resnet-densenet/inception_block.svg @@ -0,0 +1,1290 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/04-inception-resnet-densenet/resnet_block.svg b/course_UvA-DL/04-inception-resnet-densenet/resnet_block.svg new file mode 100644 index 0000000..f5977dc --- /dev/null +++ b/course_UvA-DL/04-inception-resnet-densenet/resnet_block.svg @@ -0,0 +1,1194 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/04-inception-resnet-densenet/resnet_loss_surface.png b/course_UvA-DL/04-inception-resnet-densenet/resnet_loss_surface.png new file mode 100644 index 0000000..01a97d8 Binary files /dev/null and b/course_UvA-DL/04-inception-resnet-densenet/resnet_loss_surface.png differ diff --git a/course_UvA-DL/04-inception-resnet-densenet/resnet_notation.svg b/course_UvA-DL/04-inception-resnet-densenet/resnet_notation.svg new file mode 100644 index 0000000..959a4f0 --- /dev/null +++ b/course_UvA-DL/04-inception-resnet-densenet/resnet_notation.svg @@ -0,0 +1,3 @@ + + +
[resnet_notation.svg text labels: "ResNet Block 1" … "ResNet Block 9", "Group 1 (32x32)", "Group 2 (16x16)", "Group 3 (8x8)"]
diff --git a/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_DenseNet.png b/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_DenseNet.png new file mode 100644 index 0000000..7302773 Binary files /dev/null and b/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_DenseNet.png differ diff --git a/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_GoogleNet.png b/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_GoogleNet.png new file mode 100644 index 0000000..36341ce Binary files /dev/null and b/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_GoogleNet.png differ diff --git a/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_ResNet.png b/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_ResNet.png new file mode 100644 index 0000000..e40f156 Binary files /dev/null and b/course_UvA-DL/04-inception-resnet-densenet/tensorboard_screenshot_ResNet.png differ diff --git a/course_UvA-DL/05-transformers-and-MH-attention/.meta.yml b/course_UvA-DL/05-transformers-and-MH-attention/.meta.yml new file mode 100644 index 0000000..0c8a0ee --- /dev/null +++ b/course_UvA-DL/05-transformers-and-MH-attention/.meta.yml @@ -0,0 +1,24 @@ +title: "Tutorial 5: Transformers and Multi-Head Attention" +author: Phillip Lippe +created: 2021-06-30 +updated: 2023-03-14 +license: CC BY-SA +build: 0 +tags: + - Text +description: | + In this tutorial, we will discuss one of the most impactful architectures of the last 2 years: the Transformer model. + Since the paper Attention Is All You Need by Vaswani et al. had been published in 2017, + the Transformer architecture has continued to beat benchmarks in many domains, most importantly in Natural Language Processing. + Transformers with an incredible amount of parameters can generate long, convincing essays, and opened up new application fields of AI. + As the hype of the Transformer architecture seems not to come to an end in the next years, + it is important to understand how it works, and have implemented it yourself, which we will do in this notebook. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - torchvision + - matplotlib + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - GPU diff --git a/course_UvA-DL/05-transformers-and-MH-attention/.thumb.jpg b/course_UvA-DL/05-transformers-and-MH-attention/.thumb.jpg new file mode 100644 index 0000000..e644f9a Binary files /dev/null and b/course_UvA-DL/05-transformers-and-MH-attention/.thumb.jpg differ diff --git a/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py b/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py new file mode 100644 index 0000000..753b368 --- /dev/null +++ b/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py @@ -0,0 +1,1609 @@ +# %% [markdown] +#
+# Despite the huge success of Transformers in NLP, we will _not_ include the NLP domain in our notebook here. +# There are many courses at the University of Amsterdam that focus on Natural Language Processing +# and take a closer look at the application of the Transformer architecture in NLP +# ([NLP2](https://studiegids.uva.nl/xmlpages/page/2020-2021/zoek-vak/vak/79628), +# [Advanced Topics in Computational Semantics](https://studiegids.uva.nl/xmlpages/page/2020-2021/zoek-vak/vak/80162)). +# Furthermore, and most importantly, there is so much more to the Transformer architecture. +# NLP is the domain the Transformer architecture has been originally proposed for and had the greatest impact on, +# but it also accelerated research in other domains, recently even [Computer Vision](https://arxiv.org/abs/2010.11929). +# Thus, we focus here on what makes the Transformer and self-attention so powerful in general. +# In a second notebook, we will look at Vision Transformers, i.e. Transformers for image classification +# ([link to notebook](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial15/Vision_Transformer.html)). +# +# Below, we import our standard libraries. + +# %% +# Standard libraries +import math +import os +import urllib.request +from functools import partial +from urllib.error import HTTPError + +# PyTorch Lightning +import lightning as L + +# Plotting +import matplotlib +import matplotlib.pyplot as plt +import matplotlib_inline.backend_inline +import numpy as np +import seaborn as sns + +# PyTorch +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data + +# Torchvision +import torchvision +from lightning.pytorch.callbacks import ModelCheckpoint +from torchvision import transforms +from torchvision.datasets import CIFAR100 +from tqdm.notebook import tqdm + +plt.set_cmap("cividis") +# %matplotlib inline +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +matplotlib.rcParams["lines.linewidth"] = 2.0 +sns.reset_orig() + +# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/Transformers/") + +# Setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") +print("Device:", device) + +# %% [markdown] +# Two pre-trained models are downloaded below. +# Make sure to have adjusted your `CHECKPOINT_PATH` before running this code if not already done. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial6/" +# Files to download +pretrained_files = ["ReverseTask.ckpt", "SetAnomalyTask.ckpt"] + +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. 
+for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if "/" in file_name: + os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print("Downloading %s..." % file_url) + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file manually," + " or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# ## The Transformer architecture +# +# In the first part of this notebook, we will implement the Transformer architecture by hand. +# As the architecture is so popular, there already exists a Pytorch module `nn.Transformer` +# ([documentation](https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html)) +# and a [tutorial](https://pytorch.org/tutorials/beginner/transformer_tutorial.html) +# on how to use it for next token prediction. +# However, we will implement it here ourselves, to get through to the smallest details. +# +# There are of course many more tutorials out there about attention and Transformers. +# Below, we list a few that are worth exploring if you are interested in the topic +# and might want yet another perspective on the topic after this one: +# +# * [Transformer: A Novel Neural Network Architecture for Language Understanding +# (Jakob Uszkoreit, 2017)](https://ai.googleblog.com/2017/08/transformer-novel-neural-network.html) - The original Google blog post about the Transformer paper, focusing on the application in machine translation. +# * [The Illustrated Transformer (Jay Alammar, 2018)](http://jalammar.github.io/illustrated-transformer/) - A very popular and great blog post intuitively explaining the Transformer architecture with many nice visualizations. +# The focus is on NLP. +# * [Attention? +# Attention! +# (Lilian Weng, 2018)](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html) - A nice blog post summarizing attention mechanisms in many domains including vision. +# * [Illustrated: Self-Attention (Raimi Karim, 2019)](https://towardsdatascience.com/illustrated-self-attention-2d627e33b20a) - A nice visualization of the steps of self-attention. +# Recommended going through if the explanation below is too abstract for you. +# * [The Transformer family (Lilian Weng, 2020)](https://lilianweng.github.io/lil-log/2020/04/07/the-transformer-family.html) - A very detailed blog post reviewing more variants of Transformers besides the original one. + +# %% [markdown] +# ### What is Attention? +# +# The attention mechanism describes a recent new group of layers in neural networks that has attracted +# a lot of interest in the past few years, especially in sequence tasks. +# There are a lot of different possible definitions of "attention" in the literature, +# but the one we will use here is the following: _the attention mechanism describes a weighted average +# of (sequence) elements with the weights dynamically computed based on an input query and elements' keys_. +# So what does this exactly mean? +# The goal is to take an average over the features of multiple elements. +# However, instead of weighting each element equally, we want to weight them depending on their actual values. +# In other words, we want to dynamically decide on which inputs we want to "attend" more than others. 
+
# In particular, an attention mechanism usually has four parts we need to specify:
#
# * **Query**: The query is a feature vector that describes what we are looking for in the sequence, i.e. what we might want to pay attention to.
# * **Keys**: For each input element, we have a key which is again a feature vector.
# This feature vector roughly describes what the element is "offering", or when it might be important.
# The keys should be designed such that we can identify the elements we want to pay attention to based on the query.
# * **Values**: For each input element, we also have a value vector.
# This feature vector is the one we want to average over.
# * **Score function**: To rate which elements we want to pay attention to, we need to specify a score function $f_{attn}$.
# The score function takes the query and a key as input, and outputs the score/attention weight of the query-key pair.
# It is usually implemented by simple similarity metrics like a dot product, or a small MLP.
#
#
# The weights of the average are calculated by a softmax over all score function outputs.
# Hence, we assign a higher weight to those value vectors whose corresponding key is most similar to the query.
# If we try to describe it with pseudo-math, we can write:
#
# $$
# \alpha_i = \frac{\exp\left(f_{attn}\left(\text{key}_i, \text{query}\right)\right)}{\sum_j \exp\left(f_{attn}\left(\text{key}_j, \text{query}\right)\right)}, \hspace{5mm} \text{out} = \sum_i \alpha_i \cdot \text{value}_i
# $$
#
# Visually, we can show the attention over a sequence of words as follows:
#
#
+
# For every word, we have one key and one value vector.
# The query is compared to all keys with a score function (in this case the dot product) to determine the weights.
# The softmax is not visualized for simplicity.
# Finally, the value vectors of all words are averaged using the attention weights.
#
# Most attention mechanisms differ in terms of what queries they use, how the key and value vectors are defined,
# and what score function is used.
# The attention applied inside the Transformer architecture is called **self-attention**.
# In self-attention, each sequence element provides a key, value, and query.
# For each element, we perform an attention layer where, based on its query,
# we check the similarity of all sequence elements' keys, and return a different,
# averaged value vector for each element.
# We will now go into a bit more detail by first looking at the specific implementation of the attention mechanism,
# which in the case of the Transformer is the scaled dot product attention.

# %% [markdown]
# ### Scaled Dot Product Attention
#
# The core concept behind self-attention is the scaled dot product attention.
# Our goal is to have an attention mechanism with which any element in a sequence can attend to any other while
# still being efficient to compute.
# The dot product attention takes as input a set of queries
# $Q\in\mathbb{R}^{T\times d_k}$, keys $K\in\mathbb{R}^{T\times d_k}$
# and values $V\in\mathbb{R}^{T\times d_v}$ where $T$ is the sequence length,
# and $d_k$ and $d_v$ are the hidden dimensionalities for queries/keys and values respectively.
# For simplicity, we neglect the batch dimension for now.
# The attention value from element $i$ to $j$ is based on the similarity of the query $Q_i$ and the key $K_j$,
# using the dot product as the similarity metric.
# In math, we calculate the dot product attention as follows:
#
# $$\text{Attention}(Q,K,V)=\text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
#
# The matrix multiplication $QK^T$ performs the dot product for every possible pair of queries and keys,
# resulting in a matrix of the shape $T\times T$.
# Each row represents the attention logits for a specific element $i$ to all other elements in the sequence.
# On these, we apply a softmax and multiply with the value vector to obtain a weighted mean
# (the weights being determined by the attention).
# Another perspective on this attention mechanism is offered by the computation graph visualized below
# (figure credit - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)).
#
#
+
# One aspect we haven't discussed yet is the scaling factor of $1/\sqrt{d_k}$.
# This scaling factor is crucial to maintain an appropriate variance of attention values after initialization.
# Remember that we initialize our layers with the intention of having equal variance throughout the model, and hence,
# $Q$ and $K$ might also have a variance close to $1$.
# However, performing a dot product over two vectors with unit variance results
# in a scalar having $d_k$-times higher variance:
#
# $$q_i \sim \mathcal{N}(0,1), k_i \sim \mathcal{N}(0,1) \to \text{Var}\left(\sum_{i=1}^{d_k} q_i\cdot k_i\right) = d_k$$
#
#
# If we do not scale the variance back down to $1$, the softmax over the logits will already saturate
# to $1$ for one random element and $0$ for all others.
# The gradients through the softmax will be close to zero so that we can't learn the parameters appropriately.
#
# The block `Mask (opt.)` in the diagram above represents the optional masking of specific entries in the attention matrix.
# This is, for instance, used if we stack multiple sequences with different lengths into a batch.
# To still benefit from parallelization in PyTorch, we pad the sentences to the same length and mask out the padding
# tokens during the calculation of the attention values.
# This is usually done by setting the respective attention logits to a very low value.
#
# After we have discussed the details of the scaled dot product attention block, we can write a function below
# which computes the output features given the triple of queries, keys, and values:


# %%
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    # Dot product of every query with every key, scaled by sqrt(d_k)
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        # Masked-out positions get a very low logit so their softmax weight is ~0
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention


# %% [markdown]
# Note that our code above supports any additional dimensionality in front of the sequence length
# so that we can also use it for batches.
# However, for a better understanding, let's generate a few random queries, keys, and value vectors,
# and calculate the attention outputs:

# %%
seq_len, d_k = 3, 2
L.seed_everything(42)
q = torch.randn(seq_len, d_k)
k = torch.randn(seq_len, d_k)
v = torch.randn(seq_len, d_k)
values, attention = scaled_dot_product(q, k, v)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("Values\n", values)
print("Attention\n", attention)

# %% [markdown]
# Before continuing, make sure you can follow the calculation of the specific values here, and also check it by hand.
# It is important to fully understand how the scaled dot product attention is calculated.

# %% [markdown]
# ### Multi-Head Attention
#
# The scaled dot product attention allows a network to attend over a sequence.
# However, often there are multiple different aspects a sequence element wants to attend to,
# and a single weighted average is not a good option for it.
# This is why we extend the attention mechanism to multiple heads,
# i.e. multiple different query-key-value triplets on the same features.
# Specifically, given a query, key, and value matrix, we transform those into $h$ sub-queries, sub-keys,
# and sub-values, which we pass through the scaled dot product attention independently.
# Afterward, we concatenate the heads and combine them with a final weight matrix.
+
# Mathematically, we can express this operation as:
#
# $$
# \begin{split}
# \text{Multihead}(Q,K,V) & = \text{Concat}(\text{head}_1,...,\text{head}_h)W^{O}\\
# \text{where } \text{head}_i & = \text{Attention}(QW_i^Q,KW_i^K, VW_i^V)
# \end{split}
# $$
#
# We refer to this as a Multi-Head Attention layer with the learnable parameters
# $W_{1...h}^{Q}\in\mathbb{R}^{D\times d_k}$,
# $W_{1...h}^{K}\in\mathbb{R}^{D\times d_k}$,
# $W_{1...h}^{V}\in\mathbb{R}^{D\times d_v}$,
# and $W^{O}\in\mathbb{R}^{h\cdot d_k\times d_{out}}$ ($D$ being the input dimensionality).
# Expressed in a computational graph, we can visualize it as below
# (figure credit - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)).
#
#
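# %% [markdown]
# To make the head-splitting concrete, the short sketch below only walks through the tensor shapes,
# applying the `scaled_dot_product` function from above independently per head on random sub-queries, sub-keys, and sub-values.
# The dimensions are chosen arbitrarily for illustration and are not part of the original tutorial.

# %%
batch_size, example_seq_len, num_heads, head_dim = 2, 5, 4, 8

# One sub-query/key/value per head: [Batch, Head, SeqLen, HeadDim]
sub_q = torch.randn(batch_size, num_heads, example_seq_len, head_dim)
sub_k = torch.randn(batch_size, num_heads, example_seq_len, head_dim)
sub_v = torch.randn(batch_size, num_heads, example_seq_len, head_dim)

# scaled_dot_product broadcasts over the leading batch and head dimensions
head_values, head_attention = scaled_dot_product(sub_q, sub_k, sub_v)
print("Per-head values:", head_values.shape)  # [Batch, Head, SeqLen, HeadDim]
print("Attention maps: ", head_attention.shape)  # [Batch, Head, SeqLen, SeqLen]

# Concatenating the heads again yields the input to the final output projection W^O
concat = head_values.permute(0, 2, 1, 3).reshape(batch_size, example_seq_len, num_heads * head_dim)
print("Concatenated:   ", concat.shape)  # [Batch, SeqLen, Head * HeadDim]

# %% [markdown]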
+# +# How are we applying a Multi-Head Attention layer in a neural network, +# where we don't have an arbitrary query, key, and value vector as input? +# Looking at the computation graph above, a simple but effective implementation is to set the current +# feature map in a NN, $X\in\mathbb{R}^{B\times T\times d_{\text{model}}}$, as $Q$, $K$ and $V$ +# ($B$ being the batch size, $T$ the sequence length, $d_{\text{model}}$ the hidden dimensionality of $X$). +# The consecutive weight matrices $W^{Q}$, $W^{K}$, and $W^{V}$ can transform $X$ to the corresponding +# feature vectors that represent the queries, keys, and values of the input. +# Using this approach, we can implement the Multi-Head Attention module below. + + +# %% +class MultiheadAttention(nn.Module): + def __init__(self, input_dim, embed_dim, num_heads): + super().__init__() + assert embed_dim % num_heads == 0, "Embedding dimension must be 0 modulo number of heads." + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + # Stack all weight matrices 1...h together for efficiency + # Note that in many implementations you see "bias=False" which is optional + self.qkv_proj = nn.Linear(input_dim, 3 * embed_dim) + self.o_proj = nn.Linear(embed_dim, embed_dim) + + self._reset_parameters() + + def _reset_parameters(self): + # Original Transformer initialization, see PyTorch documentation + nn.init.xavier_uniform_(self.qkv_proj.weight) + self.qkv_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.o_proj.weight) + self.o_proj.bias.data.fill_(0) + + def forward(self, x, mask=None, return_attention=False): + batch_size, seq_length, embed_dim = x.size() + qkv = self.qkv_proj(x) + + # Separate Q, K, V from linear output + qkv = qkv.reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim) + qkv = qkv.permute(0, 2, 1, 3) # [Batch, Head, SeqLen, Dims] + q, k, v = qkv.chunk(3, dim=-1) + + # Determine value outputs + values, attention = scaled_dot_product(q, k, v, mask=mask) + values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims] + values = values.reshape(batch_size, seq_length, embed_dim) + o = self.o_proj(values) + + if return_attention: + return o, attention + else: + return o + + +# %% [markdown] +# One crucial characteristic of the multi-head attention is that it is permutation-equivariant with respect to its inputs. +# This means that if we switch two input elements in the sequence, e.g. $X_1\leftrightarrow X_2$ +# (neglecting the batch dimension for now), the output is exactly the same besides the elements 1 and 2 switched. +# Hence, the multi-head attention is actually looking at the input not as a sequence, but as a set of elements. +# This property makes the multi-head attention block and the Transformer architecture so powerful and widely applicable! +# But what if the order of the input is actually important for solving the task, like language modeling? +# The answer is to encode the position in the input features, which we will take a closer look at later +# (topic _Positional encodings_ below). +# +# Before moving on to creating the Transformer architecture, we can compare the self-attention operation +# with our other common layer competitors for sequence data: convolutions and recurrent neural networks. +# Below you can find a table by [Vaswani et al. +# (2017)](https://arxiv.org/abs/1706.03762) on the complexity per layer, the number of sequential operations, +# and maximum path length. 
+# The complexity is measured by the upper bound of the number of operations to perform, while the maximum path +# length represents the maximum number of steps a forward or backward signal has to traverse to reach any other position. +# The lower this length, the better gradient signals can backpropagate for long-range dependencies. +# Let's take a look at the table below: +# +# +#
+# +# $n$ is the sequence length, $d$ is the representation dimension and $k$ is the kernel size of convolutions. +# In contrast to recurrent networks, the self-attention layer can parallelize all its operations making it much faster +# to execute for smaller sequence lengths. +# However, when the sequence length exceeds the hidden dimensionality, self-attention becomes more expensive than RNNs. +# One way of reducing the computational cost for long sequences is by restricting the self-attention to a neighborhood +# of inputs to attend over, denoted by $r$. +# Nevertheless, there has been recently a lot of work on more efficient Transformer architectures that still allow long +# dependencies, of which you can find an overview in the paper by [Tay et al. +# (2020)](https://arxiv.org/abs/2009.06732) if interested. + +# %% [markdown] +# ### Transformer Encoder +# +#
+
# Next, we will look at how to apply the multi-head attention block inside the Transformer architecture.
# Originally, the Transformer model was designed for machine translation.
# Hence, it has an encoder-decoder structure where the encoder takes as input the sentence in the original language
# and generates an attention-based representation.
# The decoder, on the other hand, attends over the encoded information and generates the translated sentence
# in an autoregressive manner, as in a standard RNN.
# While this structure is extremely useful for Sequence-to-Sequence tasks that require autoregressive decoding,
# we will focus here on the encoder part.
# Many advances in NLP have been made using pure encoder-based Transformer models (if interested, models include the
# [BERT](https://arxiv.org/abs/1810.04805)-family,
# the [Vision Transformer](https://arxiv.org/abs/2010.11929), and more),
# and in our tutorial, we will also mainly focus on the encoder part.
# If you have understood the encoder architecture, the decoder is a very small step to implement as well.
# The full Transformer architecture looks as follows
# (figure credit - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)):
#
#
+
# The encoder consists of $N$ identical blocks that are applied in sequence.
# The input $x$ is first passed through a Multi-Head Attention block as we have implemented above.
# The output is added to the original input using a residual connection,
# and we subsequently apply Layer Normalization on the sum.
# Overall, it calculates $\text{LayerNorm}(x+\text{Multihead}(x,x,x))$
# ($x$ being $Q$, $K$ and $V$ input to the attention layer).
# The residual connection is crucial in the Transformer architecture for two reasons:
#
# 1. Similar to ResNets, Transformers are designed to be very deep.
# Some models contain more than 24 blocks in the encoder.
# Hence, the residual connections are crucial for enabling a smooth gradient flow through the model.
# 2. Without the residual connection, the information about the original sequence is lost.
# Remember that the Multi-Head Attention layer ignores the position of elements in a sequence,
# and can only learn it based on the input features.
# Removing the residual connections would mean that this information is lost after the first attention layer
# (after initialization), and with a randomly initialized query and key vector,
# the output vectors for position $i$ have no relation to their original input.
# All outputs of the attention are likely to represent similar/same information,
# and there is no chance for the model to distinguish which information came from which input element.
# An alternative to the residual connection would be to fix at least one head to focus on its original input,
# but this is very inefficient and does not have the benefit of the improved gradient flow.
#
# The Layer Normalization also plays an important role in the Transformer architecture as it enables faster
# training and provides a small amount of regularization.
# Additionally, it ensures that the features have a similar magnitude among the elements in the sequence.
# We are not using Batch Normalization because it depends on the batch size, which is often small with Transformers
# (they require a lot of GPU memory), and BatchNorm has been shown to perform particularly badly on language
# as the features of words tend to have a much higher variance (there are many, very rare words
# which need to be considered for a good distribution estimate).
#
# In addition to the Multi-Head Attention, a small fully connected feed-forward network is added to the model,
# which is applied to each position separately and identically.
# Specifically, the model uses a Linear$\to$ReLU$\to$Linear MLP.
# The full transformation including the residual connection can be expressed as:
#
# $$
# \begin{split}
# \text{FFN}(x) & = \max(0, xW_1+b_1)W_2 + b_2\\
# x & = \text{LayerNorm}(x + \text{FFN}(x))
# \end{split}
# $$
#
# This MLP adds extra complexity to the model and allows transformations on each sequence element separately.
# You can think of it as allowing the model to "post-process" the new information added
# by the previous Multi-Head Attention, and prepare it for the next attention block.
# Usually, the inner dimensionality of the MLP is 2-8$\times$ larger than $d_{\text{model}}$,
# i.e. the dimensionality of the original input $x$.
# The general advantage of a wider layer instead of a narrow, multi-layer MLP is the faster, parallelizable execution.
#
# Finally, after looking at all parts of the encoder architecture, we can start implementing it below.
# We first start by implementing a single encoder block.
+
# In addition to the layers described above, we will add dropout layers in the MLP and on the output
# of the MLP and Multi-Head Attention for regularization.


# %%
class EncoderBlock(nn.Module):
    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.0):
        """EncoderBlock.

        Args:
            input_dim: Dimensionality of the input
            num_heads: Number of heads to use in the attention block
            dim_feedforward: Dimensionality of the hidden layer in the MLP
            dropout: Dropout probability to use in the dropout layers
        """
        super().__init__()

        # Attention layer
        self.self_attn = MultiheadAttention(input_dim, input_dim, num_heads)

        # Two-layer MLP
        self.linear_net = nn.Sequential(
            nn.Linear(input_dim, dim_feedforward),
            nn.Dropout(dropout),
            nn.ReLU(inplace=True),
            nn.Linear(dim_feedforward, input_dim),
        )

        # Layers to apply in between the main layers
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention part
        attn_out = self.self_attn(x, mask=mask)
        x = x + self.dropout(attn_out)
        x = self.norm1(x)

        # MLP part
        linear_out = self.linear_net(x)
        x = x + self.dropout(linear_out)
        x = self.norm2(x)

        return x


# %% [markdown]
# Based on this block, we can implement a module for the full Transformer encoder.
# In addition to a forward function that iterates through the sequence of encoder blocks,
# we also provide a function called `get_attention_maps`.
# The idea of this function is to return the attention probabilities for all Multi-Head Attention blocks in the encoder.
# This helps us in understanding, and in a sense, explaining the model.
# However, the attention probabilities should be taken with a grain of salt, as they do not necessarily
# reflect the model's true reasoning (there is a series of papers about this,
# including [Attention is not Explanation](https://arxiv.org/abs/1902.10186)
# and [Attention is not not Explanation](https://arxiv.org/abs/1908.04626)).


# %%
class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, **block_args):
        super().__init__()
        self.layers = nn.ModuleList([EncoderBlock(**block_args) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask=mask)
        return x

    def get_attention_maps(self, x, mask=None):
        attention_maps = []
        for layer in self.layers:
            _, attn_map = layer.self_attn(x, mask=mask, return_attention=True)
            attention_maps.append(attn_map)
            # Propagate through the layer (with the same mask) to obtain the input of the next layer
            x = layer(x, mask=mask)
        return attention_maps


# %% [markdown]
# ### Positional encoding
#
# We have discussed before that the Multi-Head Attention block is permutation-equivariant,
# and cannot distinguish whether an input comes before another one in the sequence or not.
# In tasks like language understanding, however, the position is important for interpreting the input words.
# The position information can therefore be added via the input features.
# We could learn an embedding for every possible position, but this would not generalize to a dynamic
# input sequence length.
# Hence, the better option is to use feature patterns that the network can identify from the features
# and potentially generalize to larger sequences.
# The specific pattern chosen by Vaswani et al.
+# are sine and cosine functions of different frequencies, as follows: +# +# $$ +# PE_{(pos,i)} = \begin{cases} +# \sin\left(\frac{pos}{10000^{i/d_{\text{model}}}}\right) & \text{if}\hspace{3mm} i \text{ mod } 2=0\\ +# \cos\left(\frac{pos}{10000^{(i-1)/d_{\text{model}}}}\right) & \text{otherwise}\\ +# \end{cases} +# $$ +# +# $PE_{(pos,i)}$ represents the position encoding at position $pos$ in the sequence, and hidden dimensionality $i$. +# These values, concatenated for all hidden dimensions, are added to the original input features +# (in the Transformer visualization above, see "Positional encoding"), and constitute the position information. +# We distinguish between even ($i \text{ mod } 2=0$) and uneven ($i \text{ mod } 2=1$) +# hidden dimensionalities where we apply a sine/cosine respectively. +# The intuition behind this encoding is that you can represent $PE_{(pos+k,:)}$ as a linear function +# of $PE_{(pos,:)}$, which might allow the model to easily attend to relative positions. +# The wavelengths in different dimensions range from $2\pi$ to $10000\cdot 2\pi$. +# +# The positional encoding is implemented below. +# The code is taken from the [PyTorch tutorial](https://pytorch.org/tutorials/beginner/transformer_tutorial.html#define-the-model) +# about Transformers on NLP and adjusted for our purposes. + + +# %% +class PositionalEncoding(nn.Module): + def __init__(self, d_model, max_len=5000): + """Positional Encoding. + + Args: + d_model: Hidden dimensionality of the input. + max_len: Maximum length of a sequence to expect. + """ + super().__init__() + + # Create matrix of [SeqLen, HiddenDim] representing the positional encoding for max_len inputs + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + + # register_buffer => Tensor which is not a parameter, but should be part of the modules state. + # Used for tensors that need to be on the same device as the module. + # persistent=False tells PyTorch to not add the buffer to the state dict (e.g. when we save the model) + self.register_buffer("pe", pe, persistent=False) + + def forward(self, x): + x = x + self.pe[:, : x.size(1)] + return x + + +# %% [markdown] +# To understand the positional encoding, we can visualize it below. +# We will generate an image of the positional encoding over hidden dimensionality and position in a sequence. +# Each pixel, therefore, represents the change of the input feature we perform to encode the specific position. +# Let's do it below. + +# %% +encod_block = PositionalEncoding(d_model=48, max_len=96) +pe = encod_block.pe.squeeze().T.cpu().numpy() + +fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 3)) +pos = ax.imshow(pe, cmap="RdGy", extent=(1, pe.shape[1] + 1, pe.shape[0] + 1, 1)) +fig.colorbar(pos, ax=ax) +ax.set_xlabel("Position in sequence") +ax.set_ylabel("Hidden dimension") +ax.set_title("Positional encoding over hidden dimensions") +ax.set_xticks([1] + [i * 10 for i in range(1, 1 + pe.shape[1] // 10)]) +ax.set_yticks([1] + [i * 10 for i in range(1, 1 + pe.shape[0] // 10)]) +plt.show() + +# %% [markdown] +# You can clearly see the sine and cosine waves with different wavelengths that encode the position +# in the hidden dimensions. 
+# Specifically, we can look at the sine/cosine wave for each hidden dimension separately, +# to get a better intuition of the pattern. +# Below we visualize the positional encoding for the hidden dimensions $1$, $2$, $3$ and $4$. + +# %% +sns.set_theme() +fig, ax = plt.subplots(2, 2, figsize=(12, 4)) +ax = [a for a_list in ax for a in a_list] +for i in range(len(ax)): + ax[i].plot(np.arange(1, 17), pe[i, :16], color="C%i" % i, marker="o", markersize=6, markeredgecolor="black") + ax[i].set_title("Encoding in hidden dimension %i" % (i + 1)) + ax[i].set_xlabel("Position in sequence", fontsize=10) + ax[i].set_ylabel("Positional encoding", fontsize=10) + ax[i].set_xticks(np.arange(1, 17)) + ax[i].tick_params(axis="both", which="major", labelsize=10) + ax[i].tick_params(axis="both", which="minor", labelsize=8) + ax[i].set_ylim(-1.2, 1.2) +fig.subplots_adjust(hspace=0.8) +sns.reset_orig() +plt.show() + +# %% [markdown] +# As we can see, the patterns between the hidden dimension $1$ and $2$ only differ in the starting angle. +# The wavelength is $2\pi$, hence the repetition after position $6$. +# The hidden dimensions $2$ and $3$ have about twice the wavelength. + +# %% [markdown] +# ### Learning rate warm-up +# +# One commonly used technique for training a Transformer is learning rate warm-up. +# This means that we gradually increase the learning rate from 0 on to our originally specified +# learning rate in the first few iterations. +# Thus, we slowly start learning instead of taking very large steps from the beginning. +# In fact, training a deep Transformer without learning rate warm-up can make the model diverge +# and achieve a much worse performance on training and testing. +# Take for instance the following plot by [Liu et al. +# (2019)](https://arxiv.org/pdf/1908.03265.pdf) comparing Adam-vanilla (i.e. Adam without warm-up) +# vs Adam with a warm-up: +# +#
+# +# Clearly, the warm-up is a crucial hyperparameter in the Transformer architecture. +# Why is it so important? +# There are currently two common explanations. +# Firstly, Adam uses the bias correction factors which however can lead to a higher variance in the adaptive +# learning rate during the first iterations. +# Improved optimizers like [RAdam](https://arxiv.org/abs/1908.03265) have been shown to overcome this issue, +# not requiring warm-up for training Transformers. +# Secondly, the iteratively applied Layer Normalization across layers can lead to very high gradients during +# the first iterations, which can be solved by using Pre-Layer Normalization +# (similar to Pre-Activation ResNet), or replacing Layer Normalization by other techniques +# (Adaptive Normalization, +# [Power Normalization](https://arxiv.org/abs/2003.07845)). +# +# Nevertheless, many applications and papers still use the original Transformer architecture with Adam, +# because warm-up is a simple, yet effective way of solving the gradient problem in the first iterations. +# There are many different schedulers we could use. +# For instance, the original Transformer paper used an exponential decay scheduler with a warm-up. +# However, the currently most popular scheduler is the cosine warm-up scheduler, +# which combines warm-up with a cosine-shaped learning rate decay. +# We can implement it below, and visualize the learning rate factor over epochs. + + +# %% +class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler): + def __init__(self, optimizer, warmup, max_iters): + self.warmup = warmup + self.max_num_iters = max_iters + super().__init__(optimizer) + + def get_lr(self): + lr_factor = self.get_lr_factor(epoch=self.last_epoch) + return [base_lr * lr_factor for base_lr in self.base_lrs] + + def get_lr_factor(self, epoch): + lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters)) + if epoch <= self.warmup: + lr_factor *= epoch * 1.0 / self.warmup + return lr_factor + + +# %% +# Needed for initializing the lr scheduler +p = nn.Parameter(torch.empty(4, 4)) +optimizer = optim.Adam([p], lr=1e-3) +lr_scheduler = CosineWarmupScheduler(optimizer=optimizer, warmup=100, max_iters=2000) + +# Plotting +epochs = list(range(2000)) +sns.set() +plt.figure(figsize=(8, 3)) +plt.plot(epochs, [lr_scheduler.get_lr_factor(e) for e in epochs]) +plt.ylabel("Learning rate factor") +plt.xlabel("Iterations (in batches)") +plt.title("Cosine Warm-up Learning Rate Scheduler") +plt.show() +sns.reset_orig() + +# %% [markdown] +# In the first 100 iterations, we increase the learning rate factor from 0 to 1, +# whereas for all later iterations, we decay it using the cosine wave. +# Pre-implementations of this scheduler can be found in the popular NLP Transformer library +# [huggingface](https://huggingface.co/transformers/main_classes/optimizer_schedules.html?highlight=cosine#transformers.get_cosine_schedule_with_warmup). + +# %% [markdown] +# ### PyTorch Lightning Module +# +# Finally, we can embed the Transformer architecture into a PyTorch lightning module. +# From Tutorial 5, you know that PyTorch Lightning simplifies our training and test code, +# as well as structures the code nicely in separate functions. +# We will implement a template for a classifier based on the Transformer encoder. +# Thereby, we have a prediction output per sequence element. +# If we would need a classifier over the whole sequence, the common approach is to add an additional +# `[CLS]` token to the sequence, representing the classifier token. 
+# However, here we focus on tasks where we have an output per element. +# +# Additionally to the Transformer architecture, we add a small input network (maps input dimensions to model dimensions), +# the positional encoding, and an output network (transforms output encodings to predictions). +# We also add the learning rate scheduler, which takes a step each iteration instead of once per epoch. +# This is needed for the warmup and the smooth cosine decay. +# The training, validation, and test step is left empty for now and will be filled for our task-specific models. + + +# %% +class TransformerPredictor(L.LightningModule): + def __init__( + self, + input_dim, + model_dim, + num_classes, + num_heads, + num_layers, + lr, + warmup, + max_iters, + dropout=0.0, + input_dropout=0.0, + ): + """TransformerPredictor. + + Args: + input_dim: Hidden dimensionality of the input + model_dim: Hidden dimensionality to use inside the Transformer + num_classes: Number of classes to predict per sequence element + num_heads: Number of heads to use in the Multi-Head Attention blocks + num_layers: Number of encoder blocks to use. + lr: Learning rate in the optimizer + warmup: Number of warmup steps. Usually between 50 and 500 + max_iters: Number of maximum iterations the model is trained for. This is needed for the CosineWarmup scheduler + dropout: Dropout to apply inside the model + input_dropout: Dropout to apply on the input features + """ + super().__init__() + self.save_hyperparameters() + self._create_model() + + def _create_model(self): + # Input dim -> Model dim + self.input_net = nn.Sequential( + nn.Dropout(self.hparams.input_dropout), nn.Linear(self.hparams.input_dim, self.hparams.model_dim) + ) + # Positional encoding for sequences + self.positional_encoding = PositionalEncoding(d_model=self.hparams.model_dim) + # Transformer + self.transformer = TransformerEncoder( + num_layers=self.hparams.num_layers, + input_dim=self.hparams.model_dim, + dim_feedforward=2 * self.hparams.model_dim, + num_heads=self.hparams.num_heads, + dropout=self.hparams.dropout, + ) + # Output classifier per sequence lement + self.output_net = nn.Sequential( + nn.Linear(self.hparams.model_dim, self.hparams.model_dim), + nn.LayerNorm(self.hparams.model_dim), + nn.ReLU(inplace=True), + nn.Dropout(self.hparams.dropout), + nn.Linear(self.hparams.model_dim, self.hparams.num_classes), + ) + + def forward(self, x, mask=None, add_positional_encoding=True): + """ + Args: + x: Input features of shape [Batch, SeqLen, input_dim] + mask: Mask to apply on the attention outputs (optional) + add_positional_encoding: If True, we add the positional encoding to the input. + Might not be desired for some tasks. + """ + x = self.input_net(x) + if add_positional_encoding: + x = self.positional_encoding(x) + x = self.transformer(x, mask=mask) + x = self.output_net(x) + return x + + @torch.no_grad() + def get_attention_maps(self, x, mask=None, add_positional_encoding=True): + """Function for extracting the attention matrices of the whole Transformer for a single batch. + + Input arguments same as the forward pass. 
+ """ + x = self.input_net(x) + if add_positional_encoding: + x = self.positional_encoding(x) + attention_maps = self.transformer.get_attention_maps(x, mask=mask) + return attention_maps + + def configure_optimizers(self): + optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr) + + # We don't return the lr scheduler because we need to apply it per iteration, not per epoch + self.lr_scheduler = CosineWarmupScheduler( + optimizer, warmup=self.hparams.warmup, max_iters=self.hparams.max_iters + ) + return optimizer + + def optimizer_step(self, *args, **kwargs): + super().optimizer_step(*args, **kwargs) + self.lr_scheduler.step() # Step per iteration + + def training_step(self, batch, batch_idx): + raise NotImplementedError + + def validation_step(self, batch, batch_idx): + raise NotImplementedError + + def test_step(self, batch, batch_idx): + raise NotImplementedError + + +# %% [markdown] +# ## Experiments +# +#
+
# After having finished the implementation of the Transformer architecture, we can start experimenting
# and applying it to various tasks.
# In this notebook, we will focus on two tasks: parallel Sequence-to-Sequence, and set anomaly detection.
# The two tasks focus on different properties of the Transformer architecture, and we go through them below.
#
# ### Sequence to Sequence
#
# A Sequence-to-Sequence task represents a task where the input _and_ the output are sequences,
# not necessarily of the same length.
# Popular tasks in this domain include machine translation and summarization.
# For this, we usually have a Transformer encoder for interpreting the input sequence,
# and a decoder for generating the output in an autoregressive manner.
# Here, however, we will go back to a much simpler example task and use only the encoder.
# Given a sequence of $N$ numbers between $0$ and $M$, the task is to reverse the input sequence.
# In Numpy notation, if our input is $x$, the output should be $x$[::-1].
# Although this task sounds very simple, RNNs can have issues with it because the task requires long-term dependencies.
# Transformers are built to support such dependencies, and hence, we expect them to perform very well.
#
# First, let's create a dataset class below.


# %%
class ReverseDataset(data.Dataset):
    def __init__(self, num_categories, seq_len, size):
        super().__init__()
        self.num_categories = num_categories
        self.seq_len = seq_len
        self.size = size

        self.data = torch.randint(self.num_categories, size=(self.size, self.seq_len))

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        inp_data = self.data[idx]
        labels = torch.flip(inp_data, dims=(0,))
        return inp_data, labels


# %% [markdown]
# We create an arbitrary number of random sequences of numbers between 0 and `num_categories-1`.
# The label is simply the tensor flipped over the sequence dimension.
# We can create the corresponding data loaders below.

# %%
dataset = partial(ReverseDataset, 10, 16)
train_loader = data.DataLoader(dataset(50000), batch_size=128, shuffle=True, drop_last=True, pin_memory=True)
val_loader = data.DataLoader(dataset(1000), batch_size=128)
test_loader = data.DataLoader(dataset(10000), batch_size=128)

# %% [markdown]
# Let's look at an arbitrary sample of the dataset:

# %%
inp_data, labels = train_loader.dataset[0]
print("Input data:", inp_data)
print("Labels:    ", labels)

# %% [markdown]
# During training, we pass the input sequence through the Transformer encoder and predict the output for each input token.
# We use the standard Cross-Entropy loss to perform this.
# Every number is represented as a one-hot vector.
# Remember that representing the categories as single scalars drastically decreases the expressiveness of the model,
# as $0$ and $1$ are not closer related than $0$ and $9$ in our example.
# An alternative to a one-hot vector is using a learned embedding vector as provided by the PyTorch module `nn.Embedding`.
# However, using a one-hot vector with an additional linear layer as in our case has the same effect
# as an embedding layer (`self.input_net` maps a one-hot vector to a dense vector,
# where each row of the weight matrix represents the embedding for a specific category).
#
# To implement the training dynamics, we create a new class inheriting from `TransformerPredictor`
# and overriding the training, validation, and test step functions.
+ + +# %% +class ReversePredictor(TransformerPredictor): + def _calculate_loss(self, batch, mode="train"): + # Fetch data and transform categories to one-hot vectors + inp_data, labels = batch + inp_data = F.one_hot(inp_data, num_classes=self.hparams.num_classes).float() + + # Perform prediction and calculate loss and accuracy + preds = self.forward(inp_data, add_positional_encoding=True) + loss = F.cross_entropy(preds.view(-1, preds.size(-1)), labels.view(-1)) + acc = (preds.argmax(dim=-1) == labels).float().mean() + + # Logging + self.log("%s_loss" % mode, loss) + self.log("%s_acc" % mode, acc) + return loss, acc + + def training_step(self, batch, batch_idx): + loss, _ = self._calculate_loss(batch, mode="train") + return loss + + def validation_step(self, batch, batch_idx): + _ = self._calculate_loss(batch, mode="val") + + def test_step(self, batch, batch_idx): + _ = self._calculate_loss(batch, mode="test") + + +# %% [markdown] +# Finally, we can create a training function similar to the one we have seen in Tutorial 5 for PyTorch Lightning. +# We create a `L.Trainer` object, running for $N$ epochs, logging in TensorBoard, and saving our best model based on the validation. +# Afterward, we test our models on the test set. +# An additional parameter we pass to the trainer here is `gradient_clip_val`. +# This clips the norm of the gradients for all parameters before taking an optimizer step and prevents the model +# from diverging if we obtain very high gradients at, for instance, sharp loss surfaces (see many good blog posts +# on gradient clipping, like [DeepAI glossary](https://deepai.org/machine-learning-glossary-and-terms/gradient-clipping)). +# For Transformers, gradient clipping can help to further stabilize the training during the first few iterations, and also afterward. +# In plain PyTorch, you can apply gradient clipping via `torch.nn.utils.clip_grad_norm_(...)` +# (see [documentation](https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html#torch.nn.utils.clip_grad_norm_)). +# The clip value is usually between 0.5 and 10, depending on how harsh you want to clip large gradients. +# After having explained this, let's implement the training function: + + +# %% +def train_reverse(**kwargs): + # Create a PyTorch Lightning trainer with the generation callback + root_dir = os.path.join(CHECKPOINT_PATH, "ReverseTask") + os.makedirs(root_dir, exist_ok=True) + trainer = L.Trainer( + default_root_dir=root_dir, + callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")], + accelerator="auto", + devices=1, + max_epochs=10, + gradient_clip_val=5, + ) + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + # Check whether pretrained model exists. 
If yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, "ReverseTask.ckpt")
    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...")
        model = ReversePredictor.load_from_checkpoint(pretrained_filename)
    else:
        model = ReversePredictor(max_iters=trainer.max_epochs * len(train_loader), **kwargs)
        trainer.fit(model, train_loader, val_loader)

    # Test best model on validation and test set
    val_result = trainer.test(model, dataloaders=val_loader, verbose=False)
    test_result = trainer.test(model, dataloaders=test_loader, verbose=False)
    result = {"test_acc": test_result[0]["test_acc"], "val_acc": val_result[0]["test_acc"]}

    model = model.to(device)
    return model, result


# %% [markdown]
# Finally, we can train the model.
# In this setup, we will use a single encoder block and a single head in the Multi-Head Attention.
# This is chosen because of the simplicity of the task, and in this case, the attention can actually be interpreted
# as an "explanation" of the predictions (compared to the other papers above dealing with deep Transformers).

# %%
reverse_model, reverse_result = train_reverse(
    input_dim=train_loader.dataset.num_categories,
    model_dim=32,
    num_heads=1,
    num_classes=train_loader.dataset.num_categories,
    num_layers=1,
    dropout=0.0,
    lr=5e-4,
    warmup=50,
)

# %% [markdown]
# The PyTorch Lightning warning regarding the number of workers can be ignored for now.
# As the dataset is so simple and `__getitem__` finishes in negligible time, we don't need subprocesses
# to provide us with the data (in fact, more workers can slow down the training as we have communication overhead among processes/threads).
# First, let's print the results:

# %%
print("Val accuracy:  %4.2f%%" % (100.0 * reverse_result["val_acc"]))
print("Test accuracy: %4.2f%%" % (100.0 * reverse_result["test_acc"]))

# %% [markdown]
# As we would have expected, the Transformer can correctly solve the task.
# However, what does the attention in the Multi-Head Attention block look like for an arbitrary input?
# Let's try to visualize it below.

# %%
data_input, labels = next(iter(val_loader))
inp_data = F.one_hot(data_input, num_classes=reverse_model.hparams.num_classes).float()
inp_data = inp_data.to(device)
attention_maps = reverse_model.get_attention_maps(inp_data)

# %% [markdown]
# The object `attention_maps` is a list of length $N$ where $N$ is the number of layers.
# Each element is a tensor of shape [Batch, Heads, SeqLen, SeqLen], which we can verify below.

# %%
attention_maps[0].shape

# %% [markdown]
# Next, we will write a plotting function that takes as input the sequences, attention maps, and an index
# indicating for which batch element we want to visualize the attention map.
# We will create a plot where over rows, we have different layers, while over columns, we show the different heads.
# Remember that the softmax has been applied for each row separately.
+ + +# %% +def plot_attention_maps(input_data, attn_maps, idx=0): + if input_data is not None: + input_data = input_data[idx].detach().cpu().numpy() + else: + input_data = np.arange(attn_maps[0][idx].shape[-1]) + attn_maps = [m[idx].detach().cpu().numpy() for m in attn_maps] + + num_heads = attn_maps[0].shape[0] + num_layers = len(attn_maps) + seq_len = input_data.shape[0] + fig_size = 4 if num_heads == 1 else 3 + fig, ax = plt.subplots(num_layers, num_heads, figsize=(num_heads * fig_size, num_layers * fig_size)) + if num_layers == 1: + ax = [ax] + if num_heads == 1: + ax = [[a] for a in ax] + for row in range(num_layers): + for column in range(num_heads): + ax[row][column].imshow(attn_maps[row][column], origin="lower", vmin=0) + ax[row][column].set_xticks(list(range(seq_len))) + ax[row][column].set_xticklabels(input_data.tolist()) + ax[row][column].set_yticks(list(range(seq_len))) + ax[row][column].set_yticklabels(input_data.tolist()) + ax[row][column].set_title("Layer %i, Head %i" % (row + 1, column + 1)) + fig.subplots_adjust(hspace=0.5) + plt.show() + + +# %% [markdown] +# Finally, we can plot the attention map of our trained Transformer on the reverse task: + +# %% +plot_attention_maps(data_input, attention_maps, idx=0) + +# %% [markdown] +# The model has learned to attend to the token that is on the flipped index of itself. +# Hence, it actually does what we intended it to do. +# We see that it however also pays some attention to values close to the flipped index. +# This is because the model doesn't need the perfect, hard attention to solve this problem, +# but is fine with this approximate, noisy attention map. +# The close-by indices are caused by the similarity of the positional encoding, +# which we also intended with the positional encoding. + +# %% [markdown] +# ### Set Anomaly Detection +# +# Besides sequences, sets are another data structure that is relevant for many applications. +# In contrast to sequences, elements are unordered in a set. +# RNNs can only be applied on sets by assuming an order in the data, which however biases the model towards +# a non-existing order in the data. +# [Vinyals et al. +# (2015)](https://arxiv.org/abs/1511.06391) and other papers have shown that the assumed order can have a significant +# impact on the model's performance, and hence, we should try to not use RNNs on sets. +# Ideally, our model should be permutation-equivariant/invariant such that the output is the same no matter how we sort the elements in a set. +# +# Transformers offer the perfect architecture for this as the Multi-Head Attention is permutation-equivariant, and thus, +# outputs the same values no matter in what order we enter the inputs (inputs and outputs are permuted equally). +# The task we are looking at for sets is _Set Anomaly Detection_ which means that we try to find the element(s) +# in a set that does not fit the others. +# In the research community, the common application of anomaly detection is performed on a set of images, +# where $N-1$ images belong to the same category/have the same high-level features while one belongs to another category. +# Note that category does not necessarily have to relate to a class in a standard classification problem, +# but could be the combination of multiple features. +# For instance, on a face dataset, this could be people with glasses, male, beard, etc. +# An example of distinguishing different animals can be seen below. +# The first four images show foxes, while the last represents a different animal. 
+# We want to recognize that the last image shows a different animal, but it is not relevant which class of animal it is. +# +#
+# +# In this tutorial, we will use the CIFAR100 dataset. +# CIFAR100 has 600 images for 100 classes each with a resolution of 32x32, similar to CIFAR10. +# The larger amount of classes requires the model to attend to specific features in the images instead +# of coarse features as in CIFAR10, therefore making the task harder. +# We will show the model a set of 9 images of one class, and 1 image from another class. +# The task is to find the image that is from a different class than the other images. +# Using the raw images directly as input to the Transformer is not a good idea, because it is not translation +# invariant as a CNN, and would need to learn to detect image features from high-dimensional input first of all. +# Instead, we will use a pre-trained ResNet34 model from the torchvision package to obtain high-level, +# low-dimensional features of the images. +# The ResNet model has been pre-trained on the [ImageNet](http://image-net.org/) dataset which contains +# 1 million images of 1k classes and varying resolutions. +# However, during training and testing, the images are usually scaled to a resolution of 224x224, +# and hence we rescale our CIFAR images to this resolution as well. +# Below, we will load the dataset, and prepare the data for being processed by the ResNet model. + +# %% +# ImageNet statistics +DATA_MEANS = np.array([0.485, 0.456, 0.406]) +DATA_STD = np.array([0.229, 0.224, 0.225]) +# As torch tensors for later preprocessing +TORCH_DATA_MEANS = torch.from_numpy(DATA_MEANS).view(1, 3, 1, 1) +TORCH_DATA_STD = torch.from_numpy(DATA_STD).view(1, 3, 1, 1) + +# Resize to 224x224, and normalize to ImageNet statistic +transform = transforms.Compose( + [transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize(DATA_MEANS, DATA_STD)] +) +# Loading the training dataset. +train_set = CIFAR100(root=DATASET_PATH, train=True, transform=transform, download=True) + +# Loading the test set +test_set = CIFAR100(root=DATASET_PATH, train=False, transform=transform, download=True) + +# %% [markdown] +# Next, we want to run the pre-trained ResNet model on the images, and extract the features before the classification layer. +# These are the most high-level features, and should sufficiently describe the images. +# CIFAR100 has some similarity to ImageNet, and thus we are not retraining the ResNet model in any form. +# However, if you would want to get the best performance and have a very large dataset, +# it would be better to add the ResNet to the computation graph during training and finetune its parameters as well. +# As we don't have a large enough dataset and want to train our model efficiently, we will extract the features beforehand. +# Let's load and prepare the model below. + +# %% +os.environ["TORCH_HOME"] = CHECKPOINT_PATH +pretrained_model = torchvision.models.resnet34(pretrained=True) +# Remove classification layer +# In some models, it is called "fc", others have "classifier" +# Setting both to an empty sequential represents an identity map of the final features. +pretrained_model.fc = nn.Sequential() +pretrained_model.classifier = nn.Sequential() +# To GPU +pretrained_model = pretrained_model.to(device) + +# Only eval, no gradient required +pretrained_model.eval() +for p in pretrained_model.parameters(): + p.requires_grad = False + +# %% [markdown] +# We will now write a extraction function for the features below. +# This cell requires access to a GPU, as the model is rather deep and the images relatively large. 
+# The GPUs on GoogleColab are sufficient, but running this cell can take 2-3 minutes. +# Once it is run, the features are exported on disk so they don't have to be recalculated every time you run the notebook. +# However, this requires >150MB free disk space. +# So it is recommended to run this only on a local computer if you have enough free disk and a GPU (GoogleColab is fine for this). +# If you do not have a GPU, you can download the features from the +# [GoogleDrive folder](https://drive.google.com/drive/folders/1DF7POc6j03pRiWQPWSl5QJX5iY-xK0sV?usp=sharing). + + +# %% +@torch.no_grad() +def extract_features(dataset, save_file): + if not os.path.isfile(save_file): + data_loader = data.DataLoader(dataset, batch_size=128, shuffle=False, drop_last=False, num_workers=4) + extracted_features = [] + for imgs, _ in tqdm(data_loader): + imgs = imgs.to(device) + feats = pretrained_model(imgs) + extracted_features.append(feats) + extracted_features = torch.cat(extracted_features, dim=0) + extracted_features = extracted_features.detach().cpu() + torch.save(extracted_features, save_file) + else: + extracted_features = torch.load(save_file) + return extracted_features + + +train_feat_file = os.path.join(CHECKPOINT_PATH, "train_set_features.tar") +train_set_feats = extract_features(train_set, train_feat_file) + +test_feat_file = os.path.join(CHECKPOINT_PATH, "test_set_features.tar") +test_feats = extract_features(test_set, test_feat_file) + +# %% [markdown] +# Let's verify the feature shapes below. +# The training should have 50k elements, and the test 10k images. +# The feature dimension is 512 for the ResNet34. +# If you experiment with other models, you likely see a different feature dimension. + +# %% +print("Train:", train_set_feats.shape) +print("Test: ", test_feats.shape) + +# %% [markdown] +# As usual, we want to create a validation set to detect when we should stop training. +# In this case, we will split the training set into 90% training, 10% validation. +# However, the difficulty is here that we need to ensure that the validation set has the same number of images for all 100 labels. +# Otherwise, we have a class imbalance which is not good for creating the image sets. +# Hence, we take 10% of the images for each class, and move them into the validation set. +# The code below does exactly this. + +# %% +# Split train into train+val +# Get labels from train set +labels = train_set.targets + +# Get indices of images per class +labels = torch.LongTensor(labels) +num_labels = labels.max() + 1 +sorted_indices = torch.argsort(labels).reshape(num_labels, -1) # [classes, num_imgs per class] + +# Determine number of validation images per class +num_val_exmps = sorted_indices.shape[1] // 10 + +# Get image indices for validation and training +val_indices = sorted_indices[:, :num_val_exmps].reshape(-1) +train_indices = sorted_indices[:, num_val_exmps:].reshape(-1) + +# Group corresponding image features and labels +train_feats, train_labels = train_set_feats[train_indices], labels[train_indices] +val_feats, val_labels = train_set_feats[val_indices], labels[val_indices] + +# %% [markdown] +# Now we can prepare a dataset class for the set anomaly task. +# We define an epoch to be the sequence in which each image has been exactly once as an "anomaly". +# Hence, the length of the dataset is the number of images in it. +# For the training set, each time we access an item with `__getitem__`, we sample a random, +# different class than the image at the corresponding index `idx` has. 
+# In a second step, we sample $N-1$ images of this sampled class. +# The set of 10 images is finally returned. +# The randomness in the `__getitem__` allows us to see a slightly different set during each iteration. +# However, we can't use the same strategy for the test set as we want the test dataset to be the same every time we iterate over it. +# Hence, we sample the sets in the `__init__` method, and return those in `__getitem__`. +# The code below implements exactly this dynamic. + + +# %% +class SetAnomalyDataset(data.Dataset): + def __init__(self, img_feats, labels, set_size=10, train=True): + """ + Args: + img_feats: Tensor of shape [num_imgs, img_dim]. Represents the high-level features. + labels: Tensor of shape [num_imgs], containing the class labels for the images + set_size: Number of elements in a set. N-1 are sampled from one class, and one from another one. + train: If True, a new set will be sampled every time __getitem__ is called. + """ + super().__init__() + self.img_feats = img_feats + self.labels = labels + self.set_size = set_size - 1 # The set size is here the size of correct images + self.train = train + + # Tensors with indices of the images per class + self.num_labels = labels.max() + 1 + self.img_idx_by_label = torch.argsort(self.labels).reshape(self.num_labels, -1) + + if not train: + self.test_sets = self._create_test_sets() + + def _create_test_sets(self): + # Pre-generates the sets for each image for the test set + test_sets = [] + num_imgs = self.img_feats.shape[0] + np.random.seed(42) + test_sets = [self.sample_img_set(self.labels[idx]) for idx in range(num_imgs)] + test_sets = torch.stack(test_sets, dim=0) + return test_sets + + def sample_img_set(self, anomaly_label): + """Samples a new set of images, given the label of the anomaly. + + The sampled images come from a different class than anomaly_label + """ + # Sample class from 0,...,num_classes-1 while skipping anomaly_label as class + set_label = np.random.randint(self.num_labels - 1) + if set_label >= anomaly_label: + set_label += 1 + + # Sample images from the class determined above + img_indices = np.random.choice(self.img_idx_by_label.shape[1], size=self.set_size, replace=False) + img_indices = self.img_idx_by_label[set_label, img_indices] + return img_indices + + def __len__(self): + return self.img_feats.shape[0] + + def __getitem__(self, idx): + anomaly = self.img_feats[idx] + if self.train: # If train => sample + img_indices = self.sample_img_set(self.labels[idx]) + else: # If test => use pre-generated ones + img_indices = self.test_sets[idx] + + # Concatenate images. The anomaly is always the last image for simplicity + img_set = torch.cat([self.img_feats[img_indices], anomaly[None]], dim=0) + indices = torch.cat([img_indices, torch.LongTensor([idx])], dim=0) + label = img_set.shape[0] - 1 + + # We return the indices of the images for visualization purpose. "Label" is the index of the anomaly + return img_set, indices, label + + +# %% [markdown] +# Next, we can setup our datasets and data loaders below. +# Here, we will use a set size of 10, i.e. 9 images from one category + 1 anomaly. +# Feel free to change it if you want to experiment with the sizes. 
+ +# %% +SET_SIZE = 10 +test_labels = torch.LongTensor(test_set.targets) + +train_anom_dataset = SetAnomalyDataset(train_feats, train_labels, set_size=SET_SIZE, train=True) +val_anom_dataset = SetAnomalyDataset(val_feats, val_labels, set_size=SET_SIZE, train=False) +test_anom_dataset = SetAnomalyDataset(test_feats, test_labels, set_size=SET_SIZE, train=False) + +train_anom_loader = data.DataLoader( + train_anom_dataset, batch_size=64, shuffle=True, drop_last=True, num_workers=4, pin_memory=True +) +val_anom_loader = data.DataLoader(val_anom_dataset, batch_size=64, shuffle=False, drop_last=False, num_workers=4) +test_anom_loader = data.DataLoader(test_anom_dataset, batch_size=64, shuffle=False, drop_last=False, num_workers=4) + +# %% [markdown] +# To understand the dataset a little better, we can plot below a few sets from the test dataset. +# Each row shows a different input set, where the first 9 are from the same class. + + +# %% +def visualize_exmp(indices, orig_dataset): + images = [orig_dataset[idx][0] for idx in indices.reshape(-1)] + images = torch.stack(images, dim=0) + images = images * TORCH_DATA_STD + TORCH_DATA_MEANS + + img_grid = torchvision.utils.make_grid(images, nrow=SET_SIZE, normalize=True, pad_value=0.5, padding=16) + img_grid = img_grid.permute(1, 2, 0) + + plt.figure(figsize=(12, 8)) + plt.title("Anomaly examples on CIFAR100") + plt.imshow(img_grid) + plt.axis("off") + plt.show() + plt.close() + + +_, indices, _ = next(iter(test_anom_loader)) +visualize_exmp(indices[:4], test_set) + +# %% [markdown] +# We can already see that for some sets the task might be easier than for others. +# Difficulties can especially arise if the anomaly is in a different, but yet visually similar class +# (e.g. train vs bus, flour vs worm, etc. +# ). +# +# After having prepared the data, we can look closer at the model. +# Here, we have a classification of the whole set. +# For the prediction to be permutation-equivariant, we will output one logit for each image. +# Over these logits, we apply a softmax and train the anomaly image to have the highest score/probability. +# This is a bit different than a standard classification layer as the softmax is applied over images, +# not over output classes in the classical sense. +# However, if we swap two images in their position, we effectively swap their position in the output softmax. +# Hence, the prediction is equivariant with respect to the input. +# We implement this idea below in the subclass of the Transformer Lightning module. + + +# %% +class AnomalyPredictor(TransformerPredictor): + def _calculate_loss(self, batch, mode="train"): + img_sets, _, labels = batch + # No positional encodings as it is a set, not a sequence! + preds = self.forward(img_sets, add_positional_encoding=False) + preds = preds.squeeze(dim=-1) # Shape: [Batch_size, set_size] + loss = F.cross_entropy(preds, labels) # Softmax/CE over set dimension + acc = (preds.argmax(dim=-1) == labels).float().mean() + self.log("%s_loss" % mode, loss) + self.log("%s_acc" % mode, acc, on_step=False, on_epoch=True) + return loss, acc + + def training_step(self, batch, batch_idx): + loss, _ = self._calculate_loss(batch, mode="train") + return loss + + def validation_step(self, batch, batch_idx): + _ = self._calculate_loss(batch, mode="val") + + def test_step(self, batch, batch_idx): + _ = self._calculate_loss(batch, mode="test") + + +# %% [markdown] +# Finally, we write our train function below. 
+# It has the exact same structure as the reverse task one, hence not much of an explanation is needed here. + + +# %% +def train_anomaly(**kwargs): + # Create a PyTorch Lightning trainer with the generation callback + root_dir = os.path.join(CHECKPOINT_PATH, "SetAnomalyTask") + os.makedirs(root_dir, exist_ok=True) + trainer = L.Trainer( + default_root_dir=root_dir, + callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")], + accelerator="auto", + devices=1, + max_epochs=100, + gradient_clip_val=2, + ) + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "SetAnomalyTask.ckpt") + if os.path.isfile(pretrained_filename): + print("Found pretrained model, loading...") + model = AnomalyPredictor.load_from_checkpoint(pretrained_filename) + else: + model = AnomalyPredictor(max_iters=trainer.max_epochs * len(train_anom_loader), **kwargs) + trainer.fit(model, train_anom_loader, val_anom_loader) + model = AnomalyPredictor.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + # Test best model on validation and test set + train_result = trainer.test(model, dataloaders=train_anom_loader, verbose=False) + val_result = trainer.test(model, dataloaders=val_anom_loader, verbose=False) + test_result = trainer.test(model, dataloaders=test_anom_loader, verbose=False) + result = { + "test_acc": test_result[0]["test_acc"], + "val_acc": val_result[0]["test_acc"], + "train_acc": train_result[0]["test_acc"], + } + + model = model.to(device) + return model, result + + +# %% [markdown] +# Let's finally train our model. +# We will use 4 layers with 4 attention heads each. +# The hidden dimensionality of the model is 256, and we use a dropout of 0.1 throughout the model for good regularization. +# Note that we also apply the dropout on the input features, as this makes the model more robust against +# image noise and generalizes better. +# Again, we use warmup to slowly start our model training. + +# %% +anomaly_model, anomaly_result = train_anomaly( + input_dim=train_anom_dataset.img_feats.shape[-1], + model_dim=256, + num_heads=4, + num_classes=1, + num_layers=4, + dropout=0.1, + input_dropout=0.1, + lr=5e-4, + warmup=100, +) + +# %% [markdown] +# We can print the achieved accuracy below. + +# %% +print("Train accuracy: %4.2f%%" % (100.0 * anomaly_result["train_acc"])) +print("Val accuracy: %4.2f%%" % (100.0 * anomaly_result["val_acc"])) +print("Test accuracy: %4.2f%%" % (100.0 * anomaly_result["test_acc"])) + +# %% [markdown] +# With ~94% validation and test accuracy, the model generalizes quite well. +# It should be noted that you might see slightly different scores depending on what computer/device you are running this notebook. +# This is because despite setting the seed before generating the test dataset, it is not the same across platforms and numpy versions. +# Nevertheless, we can conclude that the model performs quite well and can solve the task for most sets. +# Before trying to interpret the model, let's verify that our model is permutation-equivariant, +# and assigns the same predictions for different permutations of the input set. +# For this, we sample a batch from the test set and run it through the model to obtain the probabilities. 
+ +# %% +inp_data, indices, labels = next(iter(test_anom_loader)) +inp_data = inp_data.to(device) + +anomaly_model.eval() + +with torch.no_grad(): + preds = anomaly_model.forward(inp_data, add_positional_encoding=False) + preds = F.softmax(preds.squeeze(dim=-1), dim=-1) + + # Permut input data + permut = np.random.permutation(inp_data.shape[1]) + perm_inp_data = inp_data[:, permut] + perm_preds = anomaly_model.forward(perm_inp_data, add_positional_encoding=False) + perm_preds = F.softmax(perm_preds.squeeze(dim=-1), dim=-1) + +assert (preds[:, permut] - perm_preds).abs().max() < 1e-5, "Predictions are not permutation equivariant" + +print("Preds\n", preds[0, permut].cpu().numpy()) +print("Permuted preds\n", perm_preds[0].cpu().numpy()) + +# %% [markdown] +# You can see that the predictions are almost exactly the same, and only differ because of slight numerical +# differences inside the network operation. +# +# To interpret the model a little more, we can plot the attention maps inside the model. +# This will give us an idea of what information the model is sharing/communicating between images, +# and what each head might represent. +# First, we need to extract the attention maps for the test batch above, and determine the discrete predictions for simplicity. + +# %% +attention_maps = anomaly_model.get_attention_maps(inp_data, add_positional_encoding=False) +predictions = preds.argmax(dim=-1) + +# %% [markdown] +# Below we write a plot function which plots the images in the input set, the prediction of the model, +# and the attention maps of the different heads on layers of the transformer. +# Feel free to explore the attention maps for different input examples as well. + + +# %% +def visualize_prediction(idx): + visualize_exmp(indices[idx : idx + 1], test_set) + print("Prediction:", predictions[idx].item()) + plot_attention_maps(input_data=None, attn_maps=attention_maps, idx=idx) + + +visualize_prediction(0) + +# %% [markdown] +# Depending on the random seed, you might see a slightly different input set. +# For the version on the website, we compare 9 tree images with a volcano. +# We see that multiple heads, for instance, Layer 2 Head 1, Layer 2 Head 3, and Layer 3 Head 1 focus on the last image. +# Additionally, the heads in Layer 4 all seem to ignore the last image and assign a very low attention probability to it. +# This shows that the model has indeed recognized that the image doesn't fit the setting, and hence predicted it to be the anomaly. +# Layer 3 Head 2-4 seems to take a slightly weighted average of all images. +# That might indicate that the model extracts the "average" information of all images, to compare it to the image features itself. +# +# Let's try to find where the model actually makes a mistake. +# We can do this by identifying the sets where the model predicts something else than 9, as in the dataset, +# we ensured that the anomaly is always at the last position in the set. + +# %% +mistakes = torch.where(predictions != 9)[0].cpu().numpy() +print("Indices with mistake:", mistakes) + +# %% [markdown] +# As our model achieves ~94% accuracy, we only have very little number of mistakes in a batch of 64 sets. 
+# Still, let's visualize one of them, for example the last one: + +# %% +visualize_prediction(mistakes[-1]) +print("Probabilities:") +for i, p in enumerate(preds[mistakes[-1]].cpu().numpy()): + print("Image %i: %4.2f%%" % (i, 100.0 * p)) + +# %% [markdown] +# In this example, the model confuses a palm tree with a building, giving a probability of ~90% to image 2, and 8% to the actual anomaly. +# However, the difficulty here is that the picture of the building has been taken at a similar angle as the palms. +# Meanwhile, image 2 shows a rather unusual palm with a different color palette, which is why the model fails here. +# Nevertheless, in general, the model performs quite well. + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we took a closer look at the Multi-Head Attention layer which uses a scaled dot product between +# queries and keys to find correlations and similarities between input elements. +# The Transformer architecture is based on the Multi-Head Attention layer and applies multiple of them in a ResNet-like block. +# The Transformer is a very important, recent architecture that can be applied to many tasks and datasets. +# Although it is best known for its success in NLP, there is so much more to it. +# We have seen its application on sequence-to-sequence tasks and set anomaly detection. +# Its property of being permutation-equivariant if we do not provide any positional encodings, allows it to generalize to many settings. +# Hence, it is important to know the architecture, but also its possible issues such as the gradient problem during +# the first iterations solved by learning rate warm-up. +# If you are interested in continuing with the study of the Transformer architecture, +# please have a look at the blog posts listed at the beginning of the tutorial notebook. 
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/attention_example.svg b/course_UvA-DL/05-transformers-and-MH-attention/attention_example.svg
new file mode 100644
index 0000000..bd290ad
--- /dev/null
+++ b/course_UvA-DL/05-transformers-and-MH-attention/attention_example.svg
@@ -0,0 +1 @@
+<!-- SVG content omitted -->
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/cifar100_example_anomaly.png b/course_UvA-DL/05-transformers-and-MH-attention/cifar100_example_anomaly.png
new file mode 100644
index 0000000..7e06e5a
Binary files /dev/null and b/course_UvA-DL/05-transformers-and-MH-attention/cifar100_example_anomaly.png differ
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/comparison_conv_rnn.svg b/course_UvA-DL/05-transformers-and-MH-attention/comparison_conv_rnn.svg
new file mode 100644
index 0000000..7af315f
--- /dev/null
+++ b/course_UvA-DL/05-transformers-and-MH-attention/comparison_conv_rnn.svg
@@ -0,0 +1,1803 @@
+<!-- SVG content omitted -->
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/multihead_attention.svg b/course_UvA-DL/05-transformers-and-MH-attention/multihead_attention.svg
new file mode 100644
index 0000000..1019553
--- /dev/null
+++ b/course_UvA-DL/05-transformers-and-MH-attention/multihead_attention.svg
@@ -0,0 +1,282 @@
+<!-- SVG content omitted -->
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/scaled_dot_product_attn.svg b/course_UvA-DL/05-transformers-and-MH-attention/scaled_dot_product_attn.svg
new file mode 100644
index 0000000..7ca74ea
--- /dev/null
+++ b/course_UvA-DL/05-transformers-and-MH-attention/scaled_dot_product_attn.svg
@@ -0,0 +1,346 @@
+<!-- SVG content omitted -->
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/transformer_architecture.svg b/course_UvA-DL/05-transformers-and-MH-attention/transformer_architecture.svg
new file mode 100644
index 0000000..5b6b0f4
--- /dev/null
+++ b/course_UvA-DL/05-transformers-and-MH-attention/transformer_architecture.svg
@@ -0,0 +1,112 @@
+<!-- SVG content omitted (text labels: "Encoder", "Decoder") -->
diff --git a/course_UvA-DL/05-transformers-and-MH-attention/warmup_loss_plot.svg b/course_UvA-DL/05-transformers-and-MH-attention/warmup_loss_plot.svg
new file mode 100644
index 0000000..e38f81c
--- /dev/null
+++ b/course_UvA-DL/05-transformers-and-MH-attention/warmup_loss_plot.svg
@@ -0,0 +1,1573 @@
+<!-- SVG content omitted -->
diff --git a/course_UvA-DL/06-graph-neural-networks/.meta.yml b/course_UvA-DL/06-graph-neural-networks/.meta.yml
new file mode 100644
index 0000000..cfd63af
--- /dev/null
+++ b/course_UvA-DL/06-graph-neural-networks/.meta.yml
@@ -0,0 +1,31 @@
+title: "Tutorial 6: Basics of Graph Neural Networks"
+author: Phillip Lippe
+created: 2021-06-07
+updated: 2023-03-14
+license: CC BY-SA
+build: 0
+tags:
+  - Graph
+description: |
+  In this tutorial, we will discuss the application of neural networks on graphs.
+  Graph Neural Networks (GNNs) have recently gained increasing popularity in both applications and research,
+  including domains such as social networks, knowledge graphs, recommender systems, and bioinformatics.
+  While the theory and math behind GNNs might first seem complicated,
+  the implementation of those models is quite simple and helps in understanding the methodology.
+  Therefore, we will discuss the implementation of basic network layers of a GNN,
+  namely graph convolutions, and attention layers.
+  Finally, we will apply a GNN on semi-supervised node classification and molecule categorization.
+  This notebook is part of a lecture series on Deep Learning at the University of Amsterdam.
+  The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io.
+requirements:
+  - torch-scatter
+  - torch-sparse
+  - torch-cluster
+  - torch-spline-conv
+  - torch-geometric
+  - lightning>=2.0.0rc0
+pip__find-link:
+  # - https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
+  - https://pytorch-geometric.com/whl/torch-%(TORCH_MAJOR_DOT_MINOR)s.0+%(DEVICE)s.html
+accelerator:
+  - GPU
diff --git a/course_UvA-DL/06-graph-neural-networks/.thumb.jpg b/course_UvA-DL/06-graph-neural-networks/.thumb.jpg
new file mode 100644
index 0000000..0cda6bd
Binary files /dev/null and b/course_UvA-DL/06-graph-neural-networks/.thumb.jpg differ
diff --git a/course_UvA-DL/06-graph-neural-networks/GNN_overview.py b/course_UvA-DL/06-graph-neural-networks/GNN_overview.py
new file mode 100644
index 0000000..1e69357
--- /dev/null
+++ b/course_UvA-DL/06-graph-neural-networks/GNN_overview.py
@@ -0,0 +1,1001 @@
+# %% [markdown]
+#
+# We start by importing our standard libraries below. + +# %% +# Standard libraries +import os + +# For downloading pre-trained models +import urllib.request +from urllib.error import HTTPError + +# PyTorch Lightning +import lightning as L + +# PyTorch +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +# PyTorch geometric +import torch_geometric +import torch_geometric.data as geom_data +import torch_geometric.nn as geom_nn + +# PL callbacks +from lightning.pytorch.callbacks import ModelCheckpoint +from torch import Tensor + +AVAIL_GPUS = min(1, torch.cuda.device_count()) +BATCH_SIZE = 256 if AVAIL_GPUS else 64 +# Path to the folder where the datasets are/should be downloaded +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/GNNs/") + +# Setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# %% [markdown] +# We also have a few pre-trained models we download below. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial7/" +# Files to download +pretrained_files = ["NodeLevelMLP.ckpt", "NodeLevelGNN.ckpt", "GraphLevelGraphConv.ckpt"] + +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if "/" in file_name: + os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print("Downloading %s..." % file_url) + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder," + " or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# ## Graph Neural Networks + +# %% [markdown] +# ### Graph representation +# +# Before starting the discussion of specific neural network operations on graphs, we should consider how to represent a graph. +# Mathematically, a graph $\mathcal{G}$ is defined as a tuple of a set of nodes/vertices $V$, and a set of edges/links $E$: $\mathcal{G}=(V,E)$. +# Each edge is a pair of two vertices, and represents a connection between them. +# For instance, let's look at the following graph: +# +#
+# +# The vertices are $V=\{1,2,3,4\}$, and edges $E=\{(1,2), (2,3), (2,4), (3,4)\}$. +# Note that for simplicity, we assume the graph to be undirected and hence don't add mirrored pairs like $(2,1)$. +# In application, vertices and edge can often have specific attributes, and edges can even be directed. +# The question is how we could represent this diversity in an efficient way for matrix operations. +# Usually, for the edges, we decide between two variants: an adjacency matrix, or a list of paired vertex indices. +# +# The **adjacency matrix** $A$ is a square matrix whose elements indicate whether pairs of vertices are adjacent, +# i.e. connected, or not. +# In the simplest case, $A_{ij}$ is 1 if there is a connection from node $i$ to $j$, and otherwise 0. +# If we have edge attributes or different categories of edges in a graph, this information can be added to the matrix as well. +# For an undirected graph, keep in mind that $A$ is a symmetric matrix ($A_{ij}=A_{ji}$). +# For the example graph above, we have the following adjacency matrix: +# +# $$ +# A = \begin{bmatrix} +# 0 & 1 & 0 & 0\\ +# 1 & 0 & 1 & 1\\ +# 0 & 1 & 0 & 1\\ +# 0 & 1 & 1 & 0 +# \end{bmatrix} +# $$ +# +# While expressing a graph as a list of edges is more efficient in terms of memory and (possibly) computation, +# using an adjacency matrix is more intuitive and simpler to implement. +# In our implementations below, we will rely on the adjacency matrix to keep the code simple. +# However, common libraries use edge lists, which we will discuss later more. +# Alternatively, we could also use the list of edges to define a sparse adjacency matrix with which we can work +# as if it was a dense matrix, but allows more memory-efficient operations. +# PyTorch supports this with the sub-package `torch.sparse` +# ([documentation](https://pytorch.org/docs/stable/sparse.html)) which is however still in a beta-stage +# (API might change in future). + +# %% [markdown] +# ### Graph Convolutions +# +# Graph Convolutional Networks have been introduced by [Kipf et al. ](https://openreview.net/pdf?id=SJU4ayYgl) +# in 2016 at the University of Amsterdam. +# He also wrote a great [blog post](https://tkipf.github.io/graph-convolutional-networks/) about this topic, +# which is recommended if you want to read about GCNs from a different perspective. +# GCNs are similar to convolutions in images in the sense that the "filter" parameters are typically shared over all locations in the graph. +# At the same time, GCNs rely on message passing methods, which means that vertices exchange information with the neighbors, +# and send "messages" to each other. +# Before looking at the math, we can try to visually understand how GCNs work. +# The first step is that each node creates a feature vector that represents the message it wants to send to all its neighbors. +# In the second step, the messages are sent to the neighbors, so that a node receives one message per adjacent node. +# Below we have visualized the two steps for our example graph. +# +#
+# +# If we want to formulate that in more mathematical terms, we need to first decide how to combine +# all the messages a node receives. +# As the number of messages vary across nodes, we need an operation that works for any number. +# Hence, the usual way to go is to sum or take the mean. +# Given the previous features of nodes $H^{(l)}$, the GCN layer is defined as follows: +# +# $$H^{(l+1)} = \sigma\left(\hat{D}^{-1/2}\hat{A}\hat{D}^{-1/2}H^{(l)}W^{(l)}\right)$$ +# +# $W^{(l)}$ is the weight parameters with which we transform the input features into messages ($H^{(l)}W^{(l)}$). +# To the adjacency matrix $A$ we add the identity matrix so that each node sends its own message also to itself: +# $\hat{A}=A+I$. +# Finally, to take the average instead of summing, we calculate the matrix $\hat{D}$ which is a diagonal +# matrix with $D_{ii}$ denoting the number of neighbors node $i$ has. +# $\sigma$ represents an arbitrary activation function, and not necessarily the sigmoid (usually a ReLU-based +# activation function is used in GNNs). +# +# When implementing the GCN layer in PyTorch, we can take advantage of the flexible operations on tensors. +# Instead of defining a matrix $\hat{D}$, we can simply divide the summed messages by the number of neighbors afterward. +# Additionally, we replace the weight matrix with a linear layer, which additionally allows us to add a bias. +# Written as a PyTorch module, the GCN layer is defined as follows: + + +# %% +class GCNLayer(nn.Module): + def __init__(self, c_in, c_out): + super().__init__() + self.projection = nn.Linear(c_in, c_out) + + def forward(self, node_feats, adj_matrix): + """Forward. + + Args: + node_feats: Tensor with node features of shape [batch_size, num_nodes, c_in] + adj_matrix: Batch of adjacency matrices of the graph. If there is an edge from i to j, + adj_matrix[b,i,j]=1 else 0. Supports directed edges by non-symmetric matrices. + Assumes to already have added the identity connections. + Shape: [batch_size, num_nodes, num_nodes] + """ + # Num neighbours = number of incoming edges + num_neighbours = adj_matrix.sum(dim=-1, keepdims=True) + node_feats = self.projection(node_feats) + node_feats = torch.bmm(adj_matrix, node_feats) + node_feats = node_feats / num_neighbours + return node_feats + + +# %% [markdown] +# To further understand the GCN layer, we can apply it to our example graph above. +# First, let's specify some node features and the adjacency matrix with added self-connections: + +# %% +node_feats = torch.arange(8, dtype=torch.float32).view(1, 4, 2) +adj_matrix = Tensor([[[1, 1, 0, 0], [1, 1, 1, 1], [0, 1, 1, 1], [0, 1, 1, 1]]]) + +print("Node features:\n", node_feats) +print("\nAdjacency matrix:\n", adj_matrix) + +# %% [markdown] +# Next, let's apply a GCN layer to it. +# For simplicity, we initialize the linear weight matrix as an identity matrix so that the input features are equal to the messages. +# This makes it easier for us to verify the message passing operation. + +# %% +layer = GCNLayer(c_in=2, c_out=2) +layer.projection.weight.data = Tensor([[1.0, 0.0], [0.0, 1.0]]) +layer.projection.bias.data = Tensor([0.0, 0.0]) + +with torch.no_grad(): + out_feats = layer(node_feats, adj_matrix) + +print("Adjacency matrix", adj_matrix) +print("Input features", node_feats) +print("Output features", out_feats) + +# %% [markdown] +# As we can see, the first node's output values are the average of itself and the second node. +# Similarly, we can verify all other nodes. 
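+# This can also be checked programmatically for all nodes at once, using the tensors defined above:
+
+# %%
+# Recompute the expected outputs by hand: for every node, average the features of all nodes
+# it is connected to (the self-connections are already part of adj_matrix).
+with torch.no_grad():
+    expected_feats = torch.stack(
+        [node_feats[0, adj_matrix[0, i] == 1].mean(dim=0) for i in range(adj_matrix.shape[1])]
+    )
+print("Manual neighbour average matches the GCN layer output:", torch.allclose(out_feats[0], expected_feats))
+
+# %% [markdown]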
+# However, in a GNN, we would also want to allow feature exchange between nodes beyond its neighbors. +# This can be achieved by applying multiple GCN layers, which gives us the final layout of a GNN. +# The GNN can be build up by a sequence of GCN layers and non-linearities such as ReLU. +# For a visualization, see below (figure credit - [Thomas Kipf, 2016](https://tkipf.github.io/graph-convolutional-networks/)). +# +#
+# +# However, one issue we can see from looking at the example above is that the output features for nodes 3 and 4 are +# the same because they have the same adjacent nodes (including itself). +# Therefore, GCN layers can make the network forget node-specific information if we just take a mean over all messages. +# Multiple possible improvements have been proposed. +# While the simplest option might be using residual connections, the more common approach is to either weigh +# the self-connections higher or define a separate weight matrix for the self-connections. +# Alternatively, we can use a well-known concept: attention. + +# %% [markdown] +# ### Graph Attention +# +# Attention describes a weighted average of multiple elements with the weights dynamically computed based on an input +# query and elements' keys (if you don't know what attention is, it is recommended to at least go through +# the very first section called [What is Attention?](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html#What-is-Attention?)). +# This concept can be similarly applied to graphs, one of such is the Graph Attention Network +# (called GAT, proposed by [Velickovic et al., 2017](https://arxiv.org/abs/1710.10903)). +# Similarly to the GCN, the graph attention layer creates a message for each node using a linear layer/weight matrix. +# For the attention part, it uses the message from the node itself as a query, and the messages to average as both +# keys and values (note that this also includes the message to itself). +# The score function $f_{attn}$ is implemented as a one-layer MLP which maps the query and key to a single value. +# The MLP looks as follows (figure credit - [Velickovic et al. ](https://arxiv.org/abs/1710.10903)): +# +#
+# +# $h_i$ and $h_j$ are the original features from node $i$ and $j$ respectively, and represent the messages +# of the layer with $\mathbf{W}$ as weight matrix. +# $\mathbf{a}$ is the weight matrix of the MLP, which has the shape $[1,2\times d_{\text{message}}]$, +# and $\alpha_{ij}$ the final attention weight from node $i$ to $j$. +# The calculation can be described as follows: +# +# $$\alpha_{ij} = \frac{\exp\left(\text{LeakyReLU}\left(\mathbf{a}\left[\mathbf{W}h_i||\mathbf{W}h_j\right]\right)\right)}{\sum_{k\in\mathcal{N}_i} \exp\left(\text{LeakyReLU}\left(\mathbf{a}\left[\mathbf{W}h_i||\mathbf{W}h_k\right]\right)\right)}$$ +# +# The operator $||$ represents the concatenation, and $\mathcal{N}_i$ the indices of the neighbors of node $i$. +# Note that in contrast to usual practice, we apply a non-linearity (here LeakyReLU) before the softmax over elements. +# Although it seems like a minor change at first, it is crucial for the attention to depend on the original input. +# Specifically, let's remove the non-linearity for a second, and try to simplify the expression: +# +# $$ +# \begin{split} +# \alpha_{ij} & = \frac{\exp\left(\mathbf{a}\left[\mathbf{W}h_i||\mathbf{W}h_j\right]\right)}{\sum_{k\in\mathcal{N}_i} \exp\left(\mathbf{a}\left[\mathbf{W}h_i||\mathbf{W}h_k\right]\right)}\\[5pt] +# & = \frac{\exp\left(\mathbf{a}_{:,:d/2}\mathbf{W}h_i+\mathbf{a}_{:,d/2:}\mathbf{W}h_j\right)}{\sum_{k\in\mathcal{N}_i} \exp\left(\mathbf{a}_{:,:d/2}\mathbf{W}h_i+\mathbf{a}_{:,d/2:}\mathbf{W}h_k\right)}\\[5pt] +# & = \frac{\exp\left(\mathbf{a}_{:,:d/2}\mathbf{W}h_i\right)\cdot\exp\left(\mathbf{a}_{:,d/2:}\mathbf{W}h_j\right)}{\sum_{k\in\mathcal{N}_i} \exp\left(\mathbf{a}_{:,:d/2}\mathbf{W}h_i\right)\cdot\exp\left(\mathbf{a}_{:,d/2:}\mathbf{W}h_k\right)}\\[5pt] +# & = \frac{\exp\left(\mathbf{a}_{:,d/2:}\mathbf{W}h_j\right)}{\sum_{k\in\mathcal{N}_i} \exp\left(\mathbf{a}_{:,d/2:}\mathbf{W}h_k\right)}\\ +# \end{split} +# $$ +# +# We can see that without the non-linearity, the attention term with $h_i$ actually cancels itself out, +# resulting in the attention being independent of the node itself. +# Hence, we would have the same issue as the GCN of creating the same output features for nodes with the same neighbors. +# This is why the LeakyReLU is crucial and adds some dependency on $h_i$ to the attention. +# +# Once we obtain all attention factors, we can calculate the output features for each node by performing +# the weighted average: +# +# $$h_i'=\sigma\left(\sum_{j\in\mathcal{N}_i}\alpha_{ij}\mathbf{W}h_j\right)$$ +# +# $\sigma$ is yet another non-linearity, as in the GCN layer. +# Visually, we can represent the full message passing in an attention layer as follows +# (figure credit - [Velickovic et al. ](https://arxiv.org/abs/1710.10903)): +# +#
+# +# To increase the expressiveness of the graph attention network, [Velickovic et al. ](https://arxiv.org/abs/1710.10903) +# proposed to extend it to multiple heads similar to the Multi-Head Attention block in Transformers. +# This results in $N$ attention layers being applied in parallel. +# In the image above, it is visualized as three different colors of arrows (green, blue, and purple) +# that are afterward concatenated. +# The average is only applied for the very final prediction layer in a network. +# +# After having discussed the graph attention layer in detail, we can implement it below: + + +# %% +class GATLayer(nn.Module): + def __init__(self, c_in, c_out, num_heads=1, concat_heads=True, alpha=0.2): + """ + Args: + c_in: Dimensionality of input features + c_out: Dimensionality of output features + num_heads: Number of heads, i.e. attention mechanisms to apply in parallel. The + output features are equally split up over the heads if concat_heads=True. + concat_heads: If True, the output of the different heads is concatenated instead of averaged. + alpha: Negative slope of the LeakyReLU activation. + """ + super().__init__() + self.num_heads = num_heads + self.concat_heads = concat_heads + if self.concat_heads: + assert c_out % num_heads == 0, "Number of output features must be a multiple of the count of heads." + c_out = c_out // num_heads + + # Sub-modules and parameters needed in the layer + self.projection = nn.Linear(c_in, c_out * num_heads) + self.a = nn.Parameter(Tensor(num_heads, 2 * c_out)) # One per head + self.leakyrelu = nn.LeakyReLU(alpha) + + # Initialization from the original implementation + nn.init.xavier_uniform_(self.projection.weight.data, gain=1.414) + nn.init.xavier_uniform_(self.a.data, gain=1.414) + + def forward(self, node_feats, adj_matrix, print_attn_probs=False): + """Forward. + + Args: + node_feats: Input features of the node. Shape: [batch_size, c_in] + adj_matrix: Adjacency matrix including self-connections. 
Shape: [batch_size, num_nodes, num_nodes] + print_attn_probs: If True, the attention weights are printed during the forward pass + (for debugging purposes) + """ + batch_size, num_nodes = node_feats.size(0), node_feats.size(1) + + # Apply linear layer and sort nodes by head + node_feats = self.projection(node_feats) + node_feats = node_feats.view(batch_size, num_nodes, self.num_heads, -1) + + # We need to calculate the attention logits for every edge in the adjacency matrix + # Doing this on all possible combinations of nodes is very expensive + # => Create a tensor of [W*h_i||W*h_j] with i and j being the indices of all edges + # Returns indices where the adjacency matrix is not 0 => edges + edges = adj_matrix.nonzero(as_tuple=False) + node_feats_flat = node_feats.view(batch_size * num_nodes, self.num_heads, -1) + edge_indices_row = edges[:, 0] * num_nodes + edges[:, 1] + edge_indices_col = edges[:, 0] * num_nodes + edges[:, 2] + a_input = torch.cat( + [ + torch.index_select(input=node_feats_flat, index=edge_indices_row, dim=0), + torch.index_select(input=node_feats_flat, index=edge_indices_col, dim=0), + ], + dim=-1, + ) # Index select returns a tensor with node_feats_flat being indexed at the desired positions + + # Calculate attention MLP output (independent for each head) + attn_logits = torch.einsum("bhc,hc->bh", a_input, self.a) + attn_logits = self.leakyrelu(attn_logits) + + # Map list of attention values back into a matrix + attn_matrix = attn_logits.new_zeros(adj_matrix.shape + (self.num_heads,)).fill_(-9e15) + attn_matrix[adj_matrix[..., None].repeat(1, 1, 1, self.num_heads) == 1] = attn_logits.reshape(-1) + + # Weighted average of attention + attn_probs = F.softmax(attn_matrix, dim=2) + if print_attn_probs: + print("Attention probs\n", attn_probs.permute(0, 3, 1, 2)) + node_feats = torch.einsum("bijh,bjhc->bihc", attn_probs, node_feats) + + # If heads should be concatenated, we can do this by reshaping. Otherwise, take mean + if self.concat_heads: + node_feats = node_feats.reshape(batch_size, num_nodes, -1) + else: + node_feats = node_feats.mean(dim=2) + + return node_feats + + +# %% [markdown] +# Again, we can apply the graph attention layer on our example graph above to understand the dynamics better. +# As before, the input layer is initialized as an identity matrix, but we set $\mathbf{a}$ +# to be a vector of arbitrary numbers to obtain different attention values. +# We use two heads to show the parallel, independent attention mechanisms working in the layer. + +# %% +layer = GATLayer(2, 2, num_heads=2) +layer.projection.weight.data = Tensor([[1.0, 0.0], [0.0, 1.0]]) +layer.projection.bias.data = Tensor([0.0, 0.0]) +layer.a.data = Tensor([[-0.2, 0.3], [0.1, -0.1]]) + +with torch.no_grad(): + out_feats = layer(node_feats, adj_matrix, print_attn_probs=True) + +print("Adjacency matrix", adj_matrix) +print("Input features", node_feats) +print("Output features", out_feats) + +# %% [markdown] +# We recommend that you try to calculate the attention matrix at least for one head and one node for yourself. +# The entries are 0 where there does not exist an edge between $i$ and $j$. +# For the others, we see a diverse set of attention probabilities. +# Moreover, the output features of node 3 and 4 are now different although they have the same neighbors. + +# %% [markdown] +# ## PyTorch Geometric +# +# We had mentioned before that implementing graph networks with adjacency matrix is simple and straight-forward +# but can be computationally expensive for large graphs. 
+# Many real-world graphs can reach over 200k nodes, for which adjacency matrix-based implementations fail. +# There are a lot of optimizations possible when implementing GNNs, and luckily, there exist packages that provide such layers. +# The most popular packages for PyTorch are [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) +# and the [Deep Graph Library](https://www.dgl.ai/) (the latter being actually framework agnostic). +# Which one to use depends on the project you are planning to do and personal taste. +# In this tutorial, we will look at PyTorch Geometric as part of the PyTorch family. +# +# PyTorch Geometric provides us a set of common graph layers, including the GCN and GAT layer we implemented above. +# Additionally, similar to PyTorch's torchvision, it provides the common graph datasets and transformations +# on those to simplify training. +# Compared to our implementation above, PyTorch Geometric uses a list of index pairs to represent the edges. +# The details of this library will be explored further in our experiments. +# +# In our tasks below, we want to allow us to pick from a multitude of graph layers. +# Thus, we define again below a dictionary to access those using a string: + +# %% +gnn_layer_by_name = {"GCN": geom_nn.GCNConv, "GAT": geom_nn.GATConv, "GraphConv": geom_nn.GraphConv} + +# %% [markdown] +# Additionally to GCN and GAT, we added the layer `geom_nn.GraphConv` +# ([documentation](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GraphConv)). +# GraphConv is a GCN with a separate weight matrix for the self-connections. +# Mathematically, this would be: +# +# $$ +# \mathbf{x}_i^{(l+1)} = \mathbf{W}^{(l + 1)}_1 \mathbf{x}_i^{(l)} + \mathbf{W}^{(\ell + 1)}_2 \sum_{j \in \mathcal{N}_i} \mathbf{x}_j^{(l)} +# $$ +# +# In this formula, the neighbor's messages are added instead of averaged. +# However, PyTorch Geometric provides the argument `aggr` to switch between summing, averaging, and max pooling. + +# %% [markdown] +# ## Experiments on graph structures +# +#
+# +# Tasks on graph-structured data can be grouped into three groups: node-level, edge-level and graph-level. +# The different levels describe on which level we want to perform classification/regression. +# We will discuss all three types in more detail below. + +# %% [markdown] +# ### Node-level tasks: Semi-supervised node classification +# +# Node-level tasks have the goal to classify nodes in a graph. +# Usually, we have given a single, large graph with >1000 nodes of which a certain amount of nodes are labeled. +# We learn to classify those labeled examples during training and try to generalize to the unlabeled nodes. +# +# A popular example that we will use in this tutorial is the Cora dataset, a citation network among papers. +# The Cora consists of 2708 scientific publications with links between each other representing +# the citation of one paper by another. +# The task is to classify each publication into one of seven classes. +# Each publication is represented by a bag-of-words vector. +# This means that we have a vector of 1433 elements for each publication, where a 1 at feature $i$ indicates +# that the $i$-th word of a pre-defined dictionary is in the article. +# Binary bag-of-words representations are commonly used when we need very simple encodings, +# and already have an intuition of what words to expect in a network. +# There exist much better approaches, but we will leave this to the NLP courses to discuss. +# +# We will load the dataset below: + +# %% +cora_dataset = torch_geometric.datasets.Planetoid(root=DATASET_PATH, name="Cora") + +# %% [markdown] +# Let's look at how PyTorch Geometric represents the graph data. +# Note that although we have a single graph, PyTorch Geometric returns a dataset for compatibility to other datasets. + +# %% +cora_dataset[0] + +# %% [markdown] +# The graph is represented by a `Data` object +# ([documentation](https://pytorch-geometric.readthedocs.io/en/latest/modules/data.html#torch_geometric.data.Data)) +# which we can access as a standard Python namespace. +# The edge index tensor is the list of edges in the graph and contains the mirrored version of each edge for undirected graphs. +# The `train_mask`, `val_mask`, and `test_mask` are boolean masks that indicate which nodes we should use for training, +# validation, and testing. +# The `x` tensor is the feature tensor of our 2708 publications, and `y` the labels for all nodes. +# +# After having seen the data, we can implement a simple graph neural network. +# The GNN applies a sequence of graph layers (GCN, GAT, or GraphConv), ReLU as activation function, +# and dropout for regularization. +# See below for the specific implementation. + + +# %% +class GNNModel(nn.Module): + def __init__( + self, + c_in, + c_hidden, + c_out, + num_layers=2, + layer_name="GCN", + dp_rate=0.1, + **kwargs, + ): + """GNNModel. + + Args: + c_in: Dimension of input features + c_hidden: Dimension of hidden features + c_out: Dimension of the output features. Usually number of classes in classification + num_layers: Number of "hidden" graph layers + layer_name: String of the graph layer to use + dp_rate: Dropout rate to apply throughout the network + kwargs: Additional arguments for the graph layer (e.g. 
number of heads for GAT) + """ + super().__init__() + gnn_layer = gnn_layer_by_name[layer_name] + + layers = [] + in_channels, out_channels = c_in, c_hidden + for l_idx in range(num_layers - 1): + layers += [ + gnn_layer(in_channels=in_channels, out_channels=out_channels, **kwargs), + nn.ReLU(inplace=True), + nn.Dropout(dp_rate), + ] + in_channels = c_hidden + layers += [gnn_layer(in_channels=in_channels, out_channels=c_out, **kwargs)] + self.layers = nn.ModuleList(layers) + + def forward(self, x, edge_index): + """Forward. + + Args: + x: Input features per node + edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation) + """ + for layer in self.layers: + # For graph layers, we need to add the "edge_index" tensor as additional input + # All PyTorch Geometric graph layer inherit the class "MessagePassing", hence + # we can simply check the class type. + if isinstance(layer, geom_nn.MessagePassing): + x = layer(x, edge_index) + else: + x = layer(x) + return x + + +# %% [markdown] +# Good practice in node-level tasks is to create an MLP baseline that is applied to each node independently. +# This way we can verify whether adding the graph information to the model indeed improves the prediction, or not. +# It might also be that the features per node are already expressive enough to clearly point towards a specific class. +# To check this, we implement a simple MLP below. + + +# %% +class MLPModel(nn.Module): + def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1): + """MLPModel. + + Args: + c_in: Dimension of input features + c_hidden: Dimension of hidden features + c_out: Dimension of the output features. Usually number of classes in classification + num_layers: Number of hidden layers + dp_rate: Dropout rate to apply throughout the network + """ + super().__init__() + layers = [] + in_channels, out_channels = c_in, c_hidden + for l_idx in range(num_layers - 1): + layers += [nn.Linear(in_channels, out_channels), nn.ReLU(inplace=True), nn.Dropout(dp_rate)] + in_channels = c_hidden + layers += [nn.Linear(in_channels, c_out)] + self.layers = nn.Sequential(*layers) + + def forward(self, x, *args, **kwargs): + """Forward. + + Args: + x: Input features per node + """ + return self.layers(x) + + +# %% [markdown] +# Finally, we can merge the models into a PyTorch Lightning module which handles the training, +# validation, and testing for us. 
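+# Before doing so, we can run a quick, optional sanity check of the plain GNN model on the Cora graph.
+# The snippet below only verifies the output shapes with an untrained, throwaway model instance:
+
+# %%
+sanity_model = GNNModel(c_in=cora_dataset.num_node_features, c_hidden=16, c_out=cora_dataset.num_classes)
+with torch.no_grad():
+    sanity_out = sanity_model(cora_dataset[0].x, cora_dataset[0].edge_index)
+print("Output shape:", sanity_out.shape)  # [num_nodes, num_classes]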
+ + +# %% +class NodeLevelGNN(L.LightningModule): + def __init__(self, model_name, **model_kwargs): + super().__init__() + # Saving hyperparameters + self.save_hyperparameters() + + if model_name == "MLP": + self.model = MLPModel(**model_kwargs) + else: + self.model = GNNModel(**model_kwargs) + self.loss_module = nn.CrossEntropyLoss() + + def forward(self, data, mode="train"): + x, edge_index = data.x, data.edge_index + x = self.model(x, edge_index) + + # Only calculate the loss on the nodes corresponding to the mask + if mode == "train": + mask = data.train_mask + elif mode == "val": + mask = data.val_mask + elif mode == "test": + mask = data.test_mask + else: + assert False, "Unknown forward mode: %s" % mode + + loss = self.loss_module(x[mask], data.y[mask]) + acc = (x[mask].argmax(dim=-1) == data.y[mask]).sum().float() / mask.sum() + return loss, acc + + def configure_optimizers(self): + # We use SGD here, but Adam works as well + optimizer = optim.SGD(self.parameters(), lr=0.1, momentum=0.9, weight_decay=2e-3) + return optimizer + + def training_step(self, batch, batch_idx): + loss, acc = self.forward(batch, mode="train") + self.log("train_loss", loss) + self.log("train_acc", acc) + return loss + + def validation_step(self, batch, batch_idx): + _, acc = self.forward(batch, mode="val") + self.log("val_acc", acc) + + def test_step(self, batch, batch_idx): + _, acc = self.forward(batch, mode="test") + self.log("test_acc", acc) + + +# %% [markdown] +# Additionally to the Lightning module, we define a training function below. +# As we have a single graph, we use a batch size of 1 for the data loader and share the same data loader for the train, +# validation, and test set (the mask is picked inside the Lightning module). +# Besides, we set the argument `enable_progress_bar` to False as it usually shows the progress per epoch, +# but an epoch only consists of a single step. +# If you have downloaded the pre-trained models in the beginning of the tutorial, we load those instead of training from scratch. +# Finally, we test the model and return the results. + + +# %% +def train_node_classifier(model_name, dataset, **model_kwargs): + L.seed_everything(42) + node_data_loader = geom_data.DataLoader(dataset, batch_size=1) + + # Create a PyTorch Lightning trainer + root_dir = os.path.join(CHECKPOINT_PATH, "NodeLevel" + model_name) + os.makedirs(root_dir, exist_ok=True) + trainer = L.Trainer( + default_root_dir=root_dir, + callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")], + accelerator="auto", + devices=AVAIL_GPUS, + max_epochs=200, + enable_progress_bar=False, + ) # 0 because epoch size is 1 + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + # Check whether pretrained model exists. 
If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "NodeLevel%s.ckpt" % model_name) + if os.path.isfile(pretrained_filename): + print("Found pretrained model, loading...") + model = NodeLevelGNN.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything() + model = NodeLevelGNN( + model_name=model_name, c_in=dataset.num_node_features, c_out=dataset.num_classes, **model_kwargs + ) + trainer.fit(model, node_data_loader, node_data_loader) + model = NodeLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + # Test best model on the test set + test_result = trainer.test(model, dataloaders=node_data_loader, verbose=False) + batch = next(iter(node_data_loader)) + batch = batch.to(model.device) + _, train_acc = model.forward(batch, mode="train") + _, val_acc = model.forward(batch, mode="val") + result = {"train": train_acc, "val": val_acc, "test": test_result[0]["test_acc"]} + return model, result + + +# %% [markdown] +# Now, we can train our models. First, let's train the simple MLP: + + +# %% +# Small function for printing the test scores +def print_results(result_dict): + if "train" in result_dict: + print("Train accuracy: %4.2f%%" % (100.0 * result_dict["train"])) + if "val" in result_dict: + print("Val accuracy: %4.2f%%" % (100.0 * result_dict["val"])) + print("Test accuracy: %4.2f%%" % (100.0 * result_dict["test"])) + + +# %% +node_mlp_model, node_mlp_result = train_node_classifier( + model_name="MLP", dataset=cora_dataset, c_hidden=16, num_layers=2, dp_rate=0.1 +) + +print_results(node_mlp_result) + +# %% [markdown] +# Although the MLP can overfit on the training dataset because of the high-dimensional input features, +# it does not perform too well on the test set. +# Let's see if we can beat this score with our graph networks: + +# %% +node_gnn_model, node_gnn_result = train_node_classifier( + model_name="GNN", layer_name="GCN", dataset=cora_dataset, c_hidden=16, num_layers=2, dp_rate=0.1 +) +print_results(node_gnn_result) + +# %% [markdown] +# As we would have hoped for, the GNN model outperforms the MLP by quite a margin. +# This shows that using the graph information indeed improves our predictions and lets us generalizes better. +# +# The hyperparameters in the model have been chosen to create a relatively small network. +# This is because the first layer with an input dimension of 1433 can be relatively expensive to perform for large graphs. +# In general, GNNs can become relatively expensive for very big graphs. +# This is why such GNNs either have a small hidden size or use a special batching strategy +# where we sample a connected subgraph of the big, original graph. + +# %% [markdown] +# ### Edge-level tasks: Link prediction +# +# In some applications, we might have to predict on an edge-level instead of node-level. +# The most common edge-level task in GNN is link prediction. +# Link prediction means that given a graph, we want to predict whether there will be/should be an edge between two nodes or not. +# For example, in a social network, this is used by Facebook and co to propose new friends to you. +# Again, graph level information can be crucial to perform this task. +# The output prediction is usually done by performing a similarity metric on the pair of node features, +# which should be 1 if there should be a link, and otherwise close to 0. +# To keep the tutorial short, we will not implement this task ourselves. 
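+# As a rough, hypothetical sketch of the idea (the names `gnn_model` and `candidate_edges` below are
+# placeholders that are not defined in this notebook), one could score a candidate edge by taking the
+# dot product of the two node embeddings produced by a GNN and squashing it with a sigmoid:
+#
+# ```python
+# h = gnn_model(x, edge_index)  # node embeddings, shape [num_nodes, c_out]
+# src, dst = candidate_edges  # index tensors of the node pairs to score
+# link_prob = torch.sigmoid((h[src] * h[dst]).sum(dim=-1))  # close to 1 if a link is predicted
+# ```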
+# Nevertheless, there are many good resources out there if you are interested in looking closer at this task. +# Tutorials and papers for this topic include: +# +# * [PyTorch Geometric example](https://github.com/rusty1s/pytorch_geometric/blob/master/examples/link_pred.py) +# * [Graph Neural Networks: A Review of Methods and Applications](https://arxiv.org/pdf/1812.08434.pdf), Zhou et al. +# 2019 +# * [Link Prediction Based on Graph Neural Networks](https://papers.nips.cc/paper/2018/file/53f0d7c537d99b3824f0f99d62ea2428-Paper.pdf), Zhang and Chen, 2018. + +# %% [markdown] +# ### Graph-level tasks: Graph classification +# +# Finally, in this part of the tutorial, we will have a closer look at how to apply GNNs to the task of graph classification. +# The goal is to classify an entire graph instead of single nodes or edges. +# Therefore, we are also given a dataset of multiple graphs that we need to classify based on some structural graph properties. +# The most common task for graph classification is molecular property prediction, in which molecules are represented as graphs. +# Each atom is linked to a node, and edges in the graph are the bonds between atoms. +# For example, look at the figure below. +# +#
+# +# On the left, we have an arbitrary, small molecule with different atoms, whereas the right part of the image shows the graph representation. +# The atom types are abstracted as node features (e.g. a one-hot vector), and the different bond types are used as edge features. +# For simplicity, we will neglect the edge attributes in this tutorial, but you can include by using methods like the +# [Relational Graph Convolution](https://arxiv.org/abs/1703.06103) that uses a different weight matrix for each edge type. +# +# The dataset we will use below is called the MUTAG dataset. +# It is a common small benchmark for graph classification algorithms, and contain 188 graphs with 18 nodes +# and 20 edges on average for each graph. +# The graph nodes have 7 different labels/atom types, and the binary graph labels represent "their mutagenic effect +# on a specific gram negative bacterium" (the specific meaning of the labels are not too important here). +# The dataset is part of a large collection of different graph classification datasets, known as the +# [TUDatasets](https://chrsmrrs.github.io/datasets/), which is directly accessible +# via `torch_geometric.datasets.TUDataset` ([documentation](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.TUDataset)) in PyTorch Geometric. +# We can load the dataset below. + +# %% +tu_dataset = torch_geometric.datasets.TUDataset(root=DATASET_PATH, name="MUTAG") + +# %% [markdown] +# Let's look at some statistics for the dataset: + +# %% +print("Data object:", tu_dataset.data) +print("Length:", len(tu_dataset)) +print("Average label: %4.2f" % (tu_dataset.data.y.float().mean().item())) + +# %% [markdown] +# The first line shows how the dataset stores different graphs. +# The nodes, edges, and labels of each graph are concatenated to one tensor, and the dataset stores the indices +# where to split the tensors correspondingly. +# The length of the dataset is the number of graphs we have, and the "average label" +# denotes the percentage of the graph with label 1. +# As long as the percentage is in the range of 0.5, we have a relatively balanced dataset. +# It happens quite often that graph datasets are very imbalanced, hence checking the class balance +# is always a good thing to do. +# +# Next, we will split our dataset into a training and test part. +# Note that we do not use a validation set this time because of the small size of the dataset. +# Therefore, our model might overfit slightly on the validation set due to the noise of the evaluation, +# but we still get an estimate of the performance on untrained data. + +# %% +torch.manual_seed(42) +tu_dataset.shuffle() +train_dataset = tu_dataset[:150] +test_dataset = tu_dataset[150:] + +# %% [markdown] +# When using a data loader, we encounter a problem with batching $N$ graphs. +# Each graph in the batch can have a different number of nodes and edges, and hence we would require a lot of padding to obtain a single tensor. +# Torch geometric uses a different, more efficient approach: we can view the $N$ graphs in a batch as a single large graph with concatenated node and edge list. +# As there is no edge between the $N$ graphs, running GNN layers on the large graph gives us the same output as running the GNN on each graph separately. +# Visually, this batching strategy is visualized below (figure credit - PyTorch Geometric team, +# [tutorial here](https://colab.research.google.com/drive/1I8a0DfQ3fI7Njc62__mVXUlcAleUclnb)). +# +#
+# +# The adjacency matrix is zero for any nodes that come from two different graphs, and otherwise according to the adjacency matrix of the individual graph. +# Luckily, this strategy is already implemented in torch geometric, and hence we can use the corresponding data loader: + +# %% +graph_train_loader = geom_data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True) +graph_val_loader = geom_data.DataLoader(test_dataset, batch_size=BATCH_SIZE) # Additional loader for a larger datasets +graph_test_loader = geom_data.DataLoader(test_dataset, batch_size=BATCH_SIZE) + +# %% [markdown] +# Let's load a batch below to see the batching in action: + +# %% +batch = next(iter(graph_test_loader)) +print("Batch:", batch) +print("Labels:", batch.y[:10]) +print("Batch indices:", batch.batch[:40]) + +# %% [markdown] +# We have 38 graphs stacked together for the test dataset. +# The batch indices, stored in `batch`, show that the first 12 nodes belong to the first graph, +# the next 22 to the second graph, and so on. +# These indices are important for performing the final prediction. +# To perform a prediction over a whole graph, we usually perform a pooling operation over all nodes after running the GNN model. +# In this case, we will use the average pooling. +# Hence, we need to know which nodes should be included in which average pool. +# Using this pooling, we can already create our graph network below. +# Specifically, we re-use our class `GNNModel` from before, +# and simply add an average pool and single linear layer for the graph prediction task. + + +# %% +class GraphGNNModel(nn.Module): + def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs): + """GraphGNNModel. + + Args: + c_in: Dimension of input features + c_hidden: Dimension of hidden features + c_out: Dimension of output features (usually number of classes) + dp_rate_linear: Dropout rate before the linear layer (usually much higher than inside the GNN) + kwargs: Additional arguments for the GNNModel object + """ + super().__init__() + self.GNN = GNNModel(c_in=c_in, c_hidden=c_hidden, c_out=c_hidden, **kwargs) # Not our prediction output yet! + self.head = nn.Sequential(nn.Dropout(dp_rate_linear), nn.Linear(c_hidden, c_out)) + + def forward(self, x, edge_index, batch_idx): + """Forward. + + Args: + x: Input features per node + edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation) + batch_idx: Index of batch element for each node + """ + x = self.GNN(x, edge_index) + x = geom_nn.global_mean_pool(x, batch_idx) # Average pooling + x = self.head(x) + return x + + +# %% [markdown] +# Finally, we can create a PyTorch Lightning module to handle the training. +# It is similar to the modules we have seen before and does nothing surprising in terms of training. +# As we have a binary classification task, we use the Binary Cross Entropy loss. 
+ + +# %% +class GraphLevelGNN(L.LightningModule): + def __init__(self, **model_kwargs): + super().__init__() + # Saving hyperparameters + self.save_hyperparameters() + + self.model = GraphGNNModel(**model_kwargs) + self.loss_module = nn.BCEWithLogitsLoss() if self.hparams.c_out == 1 else nn.CrossEntropyLoss() + + def forward(self, data, mode="train"): + x, edge_index, batch_idx = data.x, data.edge_index, data.batch + x = self.model(x, edge_index, batch_idx) + x = x.squeeze(dim=-1) + + if self.hparams.c_out == 1: + preds = (x > 0).float() + data.y = data.y.float() + else: + preds = x.argmax(dim=-1) + loss = self.loss_module(x, data.y) + acc = (preds == data.y).sum().float() / preds.shape[0] + return loss, acc + + def configure_optimizers(self): + # High lr because of small dataset and small model + optimizer = optim.AdamW(self.parameters(), lr=1e-2, weight_decay=0.0) + return optimizer + + def training_step(self, batch, batch_idx): + loss, acc = self.forward(batch, mode="train") + self.log("train_loss", loss) + self.log("train_acc", acc) + return loss + + def validation_step(self, batch, batch_idx): + _, acc = self.forward(batch, mode="val") + self.log("val_acc", acc) + + def test_step(self, batch, batch_idx): + _, acc = self.forward(batch, mode="test") + self.log("test_acc", acc) + + +# %% [markdown] +# Below we train the model on our dataset. It resembles the typical training functions we have seen so far. + + +# %% +def train_graph_classifier(model_name, **model_kwargs): + L.seed_everything(42) + + # Create a PyTorch Lightning trainer with the generation callback + root_dir = os.path.join(CHECKPOINT_PATH, "GraphLevel" + model_name) + os.makedirs(root_dir, exist_ok=True) + trainer = L.Trainer( + default_root_dir=root_dir, + callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc")], + accelerator="cuda", + devices=AVAIL_GPUS, + max_epochs=500, + enable_progress_bar=False, + ) + trainer.logger._default_hp_metric = None + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "GraphLevel%s.ckpt" % model_name) + if os.path.isfile(pretrained_filename): + print("Found pretrained model, loading...") + model = GraphLevelGNN.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything(42) + model = GraphLevelGNN( + c_in=tu_dataset.num_node_features, + c_out=1 if tu_dataset.num_classes == 2 else tu_dataset.num_classes, + **model_kwargs, + ) + trainer.fit(model, graph_train_loader, graph_val_loader) + model = GraphLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + # Test best model on validation and test set + train_result = trainer.test(model, dataloaders=graph_train_loader, verbose=False) + test_result = trainer.test(model, dataloaders=graph_test_loader, verbose=False) + result = {"test": test_result[0]["test_acc"], "train": train_result[0]["test_acc"]} + return model, result + + +# %% [markdown] +# Finally, let's perform the training and testing. +# Feel free to experiment with different GNN layers, hyperparameters, etc. + +# %% +model, result = train_graph_classifier( + model_name="GraphConv", c_hidden=256, layer_name="GraphConv", num_layers=3, dp_rate_linear=0.5, dp_rate=0.0 +) + +# %% +print("Train performance: %4.2f%%" % (100.0 * result["train"])) +print("Test performance: %4.2f%%" % (100.0 * result["test"])) + +# %% [markdown] +# The test performance shows that we obtain quite good scores on an unseen part of the dataset. 
+# It should be noted that as we have been using the test set for validation as well, we might have overfitted slightly to this set. +# Nevertheless, the experiment shows us that GNNs can be indeed powerful to predict the properties of graphs and/or molecules. + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we have seen the application of neural networks to graph structures. +# We looked at how a graph can be represented (adjacency matrix or edge list), +# and discussed the implementation of common graph layers: GCN and GAT. +# The implementations showed the practical side of the layers, which is often easier than the theory. +# Finally, we experimented with different tasks, on node-, edge- and graph-level. +# Overall, we have seen that including graph information in the predictions can be crucial for achieving high performance. +# There are a lot of applications that benefit from GNNs, +# and the importance of these networks will likely increase over the next years. diff --git a/course_UvA-DL/06-graph-neural-networks/example_graph.svg b/course_UvA-DL/06-graph-neural-networks/example_graph.svg new file mode 100644 index 0000000..1d85c94 --- /dev/null +++ b/course_UvA-DL/06-graph-neural-networks/example_graph.svg @@ -0,0 +1,3 @@ + + +
diff --git a/course_UvA-DL/06-graph-neural-networks/gcn_network.png b/course_UvA-DL/06-graph-neural-networks/gcn_network.png
new file mode 100644
index 0000000..d191107
Binary files /dev/null and b/course_UvA-DL/06-graph-neural-networks/gcn_network.png differ
diff --git a/course_UvA-DL/06-graph-neural-networks/graph_attention.jpeg b/course_UvA-DL/06-graph-neural-networks/graph_attention.jpeg
new file mode 100644
index 0000000..4f382cb
Binary files /dev/null and b/course_UvA-DL/06-graph-neural-networks/graph_attention.jpeg differ
diff --git a/course_UvA-DL/06-graph-neural-networks/graph_attention_MLP.svg b/course_UvA-DL/06-graph-neural-networks/graph_attention_MLP.svg
new file mode 100644
index 0000000..66d219d
--- /dev/null
+++ b/course_UvA-DL/06-graph-neural-networks/graph_attention_MLP.svg
@@ -0,0 +1,553 @@
diff --git a/course_UvA-DL/06-graph-neural-networks/graph_message_passing.svg b/course_UvA-DL/06-graph-neural-networks/graph_message_passing.svg
new file mode 100644
index 0000000..35f9915
--- /dev/null
+++ b/course_UvA-DL/06-graph-neural-networks/graph_message_passing.svg
@@ -0,0 +1,3 @@
diff --git a/course_UvA-DL/06-graph-neural-networks/molecule_graph.svg b/course_UvA-DL/06-graph-neural-networks/molecule_graph.svg
new file mode 100644
index 0000000..d5c8e1d
--- /dev/null
+++ b/course_UvA-DL/06-graph-neural-networks/molecule_graph.svg
@@ -0,0 +1,434 @@
diff --git a/course_UvA-DL/06-graph-neural-networks/torch_geometric_stacking_graphs.png b/course_UvA-DL/06-graph-neural-networks/torch_geometric_stacking_graphs.png
new file mode 100644
index 0000000..14bccb9
Binary files /dev/null and b/course_UvA-DL/06-graph-neural-networks/torch_geometric_stacking_graphs.png differ
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/.meta.yml b/course_UvA-DL/07-deep-energy-based-generative-models/.meta.yml
new file mode 100644
index 0000000..a5d7c01
--- /dev/null
+++ b/course_UvA-DL/07-deep-energy-based-generative-models/.meta.yml
@@ -0,0 +1,28 @@
+title: "Tutorial 7: Deep Energy-Based Generative Models"
+author: Phillip Lippe
+created: 2021-07-12
+updated: 2023-03-14
+license: CC BY-SA
+build: 0
+tags:
+  - Image
+description: |
+  In this tutorial, we will look at energy-based deep learning models, and focus on their application as generative models.
+  Energy models were a popular tool before the huge deep learning hype around 2012 hit.
+  However, in recent years, energy-based models have gained increasing attention because of improved training methods and tricks being proposed.
+  Although they are still in a research stage, they have been shown to outperform strong Generative Adversarial Networks,
+  which have been the state of the art for generating images, in certain cases
+  ([blog post](https://ajolicoeur.wordpress.com/the-new-contender-to-gans-score-matching-with-langevin-sampling/) about strong energy-based models,
+  [blog post](https://medium.com/syncedreview/nvidia-open-sources-hyper-realistic-face-generator-stylegan-f346e1a73826) about the power of GANs).
+  Hence, it is important to be aware of energy-based models, and as the theory can be abstract sometimes,
+  we will show the idea of energy-based models with a lot of examples.
+  This notebook is part of a lecture series on Deep Learning at the University of Amsterdam.
+  The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io.
+requirements:
+  - torchvision
+  - matplotlib
+  - tensorboard
+  - lightning>=2.0.0rc0
+accelerator:
+  - CPU
+  - GPU
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/.thumb.jpg b/course_UvA-DL/07-deep-energy-based-generative-models/.thumb.jpg
new file mode 100644
index 0000000..32cd948
Binary files /dev/null and b/course_UvA-DL/07-deep-energy-based-generative-models/.thumb.jpg differ
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py b/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
new file mode 100644
index 0000000..8626533
--- /dev/null
+++ b/course_UvA-DL/07-deep-energy-based-generative-models/Deep_Energy_Models.py
@@ -0,0 +1,888 @@
+# %% [markdown]
+#
+# First, let's import our standard libraries below. + +# %% +# Standard libraries +import os +import random +import urllib.request +from urllib.error import HTTPError + +# PyTorch Lightning +import lightning as L + +# Plotting +import matplotlib +import matplotlib.pyplot as plt + +# %matplotlib inline +import matplotlib_inline.backend_inline +import numpy as np + +# PyTorch +import torch +import torch.nn as nn +import torch.optim as optim +import torch.utils.data as data + +# Torchvision +import torchvision +from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint +from torchvision import transforms +from torchvision.datasets import MNIST + +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +matplotlib.rcParams["lines.linewidth"] = 2.0 + +# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/tutorial8") + +# Setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + +# %% [markdown] +# We also have pre-trained models that we download below. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial8/" +# Files to download +pretrained_files = ["MNIST.ckpt", "tensorboards/events.out.tfevents.MNIST"] + +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if "/" in file_name: + os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print("Downloading %s..." % file_url) + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the files manually," + " or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# ## Energy Models +# +# In the first part of this tutorial, we will review the theory of the energy-based models +# (the same theory has been discussed in Lecture 8). +# While most of the previous models had the goal of classification or regression, +# energy-based models are motivated from a different perspective: density estimation. +# Given a dataset with a lot of elements, we want to estimate the probability distribution over the whole data space. +# As an example, if we model images from CIFAR10, our goal would be to have a probability distribution +# over all possible images of size $32\times32\times3$ where those images have a high likelihood +# that look realistic and are one of the 10 CIFAR classes. +# Simple methods like interpolation between images don't work because images are extremely high-dimensional +# (especially for large HD images). +# Hence, we turn to deep learning methods that have performed well on complex data. +# +# However, how do we predict a probability distribution $p(\mathbf{x})$ over so many dimensions using a simple neural network? 
+# The problem is that we cannot just predict a score between 0 and 1, +# because a probability distribution over data needs to fulfill two properties: +# +# 1. +# The probability distribution needs to assign any possible value of +# $\mathbf{x}$ a non-negative value: $p(\mathbf{x}) \geq 0$. +# 2. +# The probability density must sum/integrate to 1 over **all** possible inputs: +# $\int_{\mathbf{x}} p(\mathbf{x}) d\mathbf{x} = 1$. +# +# Luckily, there are actually many approaches for this, and one of them are energy-based models. +# The fundamental idea of energy-based models is that you can turn any function +# that predicts values larger than zero into a probability distribution by dviding by its volume. +# Imagine we have a neural network, which has as output a single neuron, like in regression. +# We can call this network $E_{\theta}(\mathbf{x})$, where $\theta$ are our parameters of the network, +# and $\mathbf{x}$ the input data (e.g. an image). +# The output of $E_{\theta}$ is a scalar value between $-\infty$ and $\infty$. +# Now, we can use basic probability theory to *normalize* the scores of all possible inputs: +# +# $$ +# q_{\theta}(\mathbf{x}) = \frac{\exp\left(-E_{\theta}(\mathbf{x})\right)}{Z_{\theta}} \hspace{5mm}\text{where}\hspace{5mm} +# Z_{\theta} = \begin{cases} +# \int_{\mathbf{x}}\exp\left(-E_{\theta}(\mathbf{x})\right) d\mathbf{x} & \text{if }x\text{ is continuous}\\ +# \sum_{\mathbf{x}}\exp\left(-E_{\theta}(\mathbf{x})\right) & \text{if }x\text{ is discrete} +# \end{cases} +# $$ +# +# The $\exp$-function ensures that we assign a probability greater than zero to any possible input. +# We use a negative sign in front of $E$ because we call $E_{\theta}$ to be the energy function: +# data points with high likelihood have a low energy, while data points with low likelihood have a high energy. +# $Z_{\theta}$ is our normalization terms that ensures that the density integrates/sums to 1. +# We can show this by integrating over $q_{\theta}(\mathbf{x})$: +# +# $$ +# \int_{\mathbf{x}}q_{\theta}(\mathbf{x})d\mathbf{x} = +# \int_{\mathbf{x}}\frac{\exp\left(-E_{\theta}(\mathbf{x})\right)}{\int_{\mathbf{\tilde{x}}}\exp\left(-E_{\theta}(\mathbf{\tilde{x}})\right) d\mathbf{\tilde{x}}}d\mathbf{x} = +# \frac{\int_{\mathbf{x}}\exp\left(-E_{\theta}(\mathbf{x})\right)d\mathbf{x}}{\int_{\mathbf{\tilde{x}}}\exp\left(-E_{\theta}(\mathbf{\tilde{x}})\right) d\mathbf{\tilde{x}}} = 1 +# $$ +# +# Note that we call the probability distribution $q_{\theta}(\mathbf{x})$ because this is the learned distribution by the model, +# and is trained to be as close as possible to the *true*, unknown distribution $p(\mathbf{x})$. +# +# The main benefit of this formulation of the probability distribution is its great flexibility as we can choose +# $E_{\theta}$ in whatever way we like, without any constraints. +# Nevertheless, when looking at the equation above, we can see a fundamental issue: How do we calculate $Z_{\theta}$? +# There is no chance that we can calculate $Z_{\theta}$ analytically for high-dimensional input +# and/or larger neural networks, but the task requires us to know $Z_{\theta}$. +# Although we can't determine the exact likelihood of a point, there exist methods with which we can train energy-based models. +# Thus, we will look next at "Contrastive Divergence" for training the model. + +# %% [markdown] +# ### Contrastive Divergence +# +# When we train a model on generative modeling, it is usually done by maximum likelihood estimation. 
+# In other words, we try to maximize the likelihood of the examples in the training set. +# As the exact likelihood of a point cannot be determined due to the unknown normalization constant $Z_{\theta}$, +# we need to train energy-based models slightly different. +# We cannot just maximize the un-normalized probability $\exp(-E_{\theta}(\mathbf{x}_{\text{train}}))$ +# because there is no guarantee that $Z_{\theta}$ stays constant, or that $\mathbf{x}_{\text{train}}$ +# is becoming more likely than the others. +# However, if we base our training on comparing the likelihood of points, we can create a stable objective. +# Namely, we can re-write our maximum likelihood objective where we maximize the probability +# of $\mathbf{x}_{\text{train}}$ compared to a randomly sampled data point of our model: +# +# $$ +# \begin{split} +# \nabla_{\theta}\mathcal{L}_{\text{MLE}}(\mathbf{\theta};p) & = -\mathbb{E}_{p(\mathbf{x})}\left[\nabla_{\theta}\log q_{\theta}(\mathbf{x})\right]\\[5pt] +# & = \mathbb{E}_{p(\mathbf{x})}\left[\nabla_{\theta}E_{\theta}(\mathbf{x})\right] - \mathbb{E}_{q_{\theta}(\mathbf{x})}\left[\nabla_{\theta}E_{\theta}(\mathbf{x})\right] +# \end{split} +# $$ +# +# Note that the loss is still an objective we want to minimize. +# Thus, we try to minimize the energy for data points from the dataset, while maximizing the energy for randomly +# sampled data points from our model (how we sample will be explained below). +# Although this objective sounds intuitive, how is it actually derived from our original distribution $q_{\theta}(\mathbf{x})$? +# The trick is that we approximate $Z_{\theta}$ by a single Monte-Carlo sample. +# This gives us the exact same objective as written above. +# +# Visually, we can look at the objective as follows (figure credit +# - [Stefano Ermon and Aditya Grover](https://deepgenerativemodels.github.io/assets/slides/cs236_lecture11.pdf)): +# +#
+# +# $f_{\theta}$ represents $\exp(-E_{\theta}(\mathbf{x}))$ in our case. +# The point on the right, called "correct answer", represents a data point from the dataset +# (i.e. $x_{\text{train}}$), and the left point, "wrong answer", a sample from our model (i.e. $x_{\text{sample}}$). +# Thus, we try to "pull up" the probability of the data points in the dataset, +# while "pushing down" randomly sampled points. +# The two forces for pulling and pushing are in balance iff $q_{\theta}(\mathbf{x})=p(\mathbf{x})$. + +# %% [markdown] +# ### Sampling from Energy-Based Models +# +# For sampling from an energy-based model, we can apply a Markov Chain Monte Carlo using Langevin Dynamics. +# The idea of the algorithm is to start from a random point, and slowly move towards the direction +# of higher probability using the gradients of $E_{\theta}$. +# Nevertheless, this is not enough to fully capture the probability distribution. +# We need to add noise $\omega$ at each gradient step to the current sample. +# Under certain conditions such as that we perform the gradient steps an infinite amount of times, +# we would be able to create an exact sample from our modeled distribution. +# However, as this is not practically possible, we usually limit the chain to $K$ steps +# ($K$ a hyperparameter that needs to be finetuned). +# Overall, the sampling procedure can be summarized in the following algorithm: +# +#
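+#
+# As a rough sketch in the notation used above (step size $\nu$, noise $\omega$), each of the $K$ update steps has the form:
+#
+# $$
+# \mathbf{x}^{(k+1)} = \mathbf{x}^{(k)} - \nu \nabla_{\mathbf{x}}E_{\theta}\left(\mathbf{x}^{(k)}\right) + \omega, \hspace{5mm} \omega \sim \mathcal{N}(0, \sigma^2)
+# $$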
+ +# %% [markdown] +# ### Applications of Energy-based models beyond generation +# +# Modeling the probability distribution for sampling new data is not the only application of energy-based models. +# Any application which requires us to compare two elements is much simpler to learn +# because we just need to go for the higher energy. +# A couple of examples are shown below (figure credit +# - [Stefano Ermon and Aditya Grover](https://deepgenerativemodels.github.io/assets/slides/cs236_lecture11.pdf)). +# A classification setup like object recognition or sequence labeling can be considered as an energy-based +# task as we just need to find the $Y$ input that minimizes the output $E(X, Y)$ (hence maximizes probability). +# Similarly, a popular application of energy-based models is denoising of images. +# Given an image $X$ with a lot of noise, we try to minimize the energy by finding the true input image $Y$. +# +#
+# +# Nonetheless, we will focus on generative modeling here as in the next couple of lectures, +# we will discuss more generative deep learning approaches. + +# %% [markdown] +# ## Image generation +# +#
+# +# As an example for energy-based models, we will train a model on image generation. +# Specifically, we will look at how we can generate MNIST digits with a very simple CNN model. +# However, it should be noted that energy models are not easy to train and often diverge +# if the hyperparameters are not well tuned. +# We will rely on training tricks proposed in the paper +# [Implicit Generation and Generalization in Energy-Based Models](https://arxiv.org/abs/1903.08689) +# by Yilun Du and Igor Mordatch ([blog](https://openai.com/blog/energy-based-models/)). +# The important part of this notebook is however to see how the theory above can actually be used in a model. +# +# ### Dataset +# +# First, we can load the MNIST dataset below. +# Note that we need to normalize the images between -1 and 1 instead of mean 0 and std 1 because during sampling, +# we have to limit the input space. +# Scaling between -1 and 1 makes it easier to implement it. + +# %% +# Transformations applied on each image => make them a tensor and normalize between -1 and 1 +transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]) + +# Loading the training dataset. We need to split it into a training and validation part +train_set = MNIST(root=DATASET_PATH, train=True, transform=transform, download=True) + +# Loading the test set +test_set = MNIST(root=DATASET_PATH, train=False, transform=transform, download=True) + +# We define a set of data loaders that we can use for various purposes later. +# Note that for actually training a model, we will use different data loaders +# with a lower batch size. +train_loader = data.DataLoader(train_set, batch_size=128, shuffle=True, drop_last=True, num_workers=4, pin_memory=True) +test_loader = data.DataLoader(test_set, batch_size=256, shuffle=False, drop_last=False, num_workers=4) + +# %% [markdown] +# ### CNN Model +# +# First, we implement our CNN model. +# The MNIST images are of size 28x28, hence we only need a small model. +# As an example, we will apply several convolutions with stride 2 that downscale the images. +# If you are interested, you can also use a deeper model such as a small ResNet, but for simplicity, +# we will stick with the tiny network. +# +# It is a good practice to use a smooth activation function like Swish instead of ReLU in the energy model. +# This is because we will rely on the gradients we get back with respect to the input image, which should not be sparse. + + +# %% +class CNNModel(nn.Module): + def __init__(self, hidden_features=32, out_dim=1, **kwargs): + super().__init__() + # We increase the hidden dimension over layers. Here pre-calculated for simplicity. 
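+        # For example, the default hidden_features=32 gives c_hid1=16, c_hid2=32, c_hid3=64.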
+ c_hid1 = hidden_features // 2 + c_hid2 = hidden_features + c_hid3 = hidden_features * 2 + + # Series of convolutions and Swish activation functions + self.cnn_layers = nn.Sequential( + nn.Conv2d(1, c_hid1, kernel_size=5, stride=2, padding=4), # [16x16] - Larger padding to get 32x32 image + nn.SiLU(), + nn.Conv2d(c_hid1, c_hid2, kernel_size=3, stride=2, padding=1), # [8x8] + nn.SiLU(), + nn.Conv2d(c_hid2, c_hid3, kernel_size=3, stride=2, padding=1), # [4x4] + nn.SiLU(), + nn.Conv2d(c_hid3, c_hid3, kernel_size=3, stride=2, padding=1), # [2x2] + nn.SiLU(), + nn.Flatten(), + nn.Linear(c_hid3 * 4, c_hid3), + nn.SiLU(), + nn.Linear(c_hid3, out_dim), + ) + + def forward(self, x): + x = self.cnn_layers(x).squeeze(dim=-1) + return x + + +# %% [markdown] +# In the rest of the notebook, the output of the model will actually not represent +# $E_{\theta}(\mathbf{x})$, but $-E_{\theta}(\mathbf{x})$. +# This is a standard implementation practice for energy-based models, as some people also write the energy probability +# density as $q_{\theta}(\mathbf{x}) = \frac{\exp\left(f_{\theta}(\mathbf{x})\right)}{Z_{\theta}}$. +# In that case, the model would actually represent $f_{\theta}(\mathbf{x})$. +# In the training loss etc., we need to be careful to not switch up the signs. + +# %% [markdown] +# ### Sampling buffer +# +# In the next part, we look at the training with sampled elements. +# To use the contrastive divergence objective, we need to generate samples during training. +# Previous work has shown that due to the high dimensionality of images, we need a lot of iterations +# inside the MCMC sampling to obtain reasonable samples. +# However, there is a training trick that significantly reduces the sampling cost: using a sampling buffer. +# The idea is that we store the samples of the last couple of batches in a buffer, +# and re-use those as the starting point of the MCMC algorithm for the next batches. +# This reduces the sampling cost because the model requires a significantly +# lower number of steps to converge to reasonable samples. +# However, to not solely rely on previous samples and allow novel samples as well, +# we re-initialize 5% of our samples from scratch (random noise between -1 and 1). +# +# Below, we implement the sampling buffer. +# The function `sample_new_exmps` returns a new batch of "fake" images. +# We refer to those as fake images because they have been generated, but are not actually part of the dataset. +# As mentioned before, we use initialize 5% randomly, and 95% are randomly picked from our buffer. +# On this initial batch, we perform MCMC for 60 iterations to improve the image quality +# and come closer to samples from $q_{\theta}(\mathbf{x})$. +# In the function `generate_samples`, we implemented the MCMC for images. +# Note that the hyperparameters of `step_size`, `steps`, the noise standard deviation +# $\sigma$ are specifically set for MNIST, and need to be finetuned for a different dataset if you want to use such. + + +# %% +class Sampler: + def __init__(self, model, img_shape, sample_size, max_len=8192): + """Sampler. 
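+
+        Keeps a buffer of previously generated images and re-uses them as starting points for new MCMC chains.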
+ + Args: + model: Neural network to use for modeling E_theta + img_shape: Shape of the images to model + sample_size: Batch size of the samples + max_len: Maximum number of data points to keep in the buffer + """ + super().__init__() + self.model = model + self.img_shape = img_shape + self.sample_size = sample_size + self.max_len = max_len + self.examples = [(torch.rand((1,) + img_shape) * 2 - 1) for _ in range(self.sample_size)] + + def sample_new_exmps(self, steps=60, step_size=10): + """Function for getting a new batch of "fake" images. + + Args: + steps: Number of iterations in the MCMC algorithm + step_size: Learning rate nu in the algorithm above + """ + # Choose 95% of the batch from the buffer, 5% generate from scratch + n_new = np.random.binomial(self.sample_size, 0.05) + rand_imgs = torch.rand((n_new,) + self.img_shape) * 2 - 1 + old_imgs = torch.cat(random.choices(self.examples, k=self.sample_size - n_new), dim=0) + inp_imgs = torch.cat([rand_imgs, old_imgs], dim=0).detach().to(device) + + # Perform MCMC sampling + inp_imgs = Sampler.generate_samples(self.model, inp_imgs, steps=steps, step_size=step_size) + + # Add new images to the buffer and remove old ones if needed + self.examples = list(inp_imgs.to(torch.device("cpu")).chunk(self.sample_size, dim=0)) + self.examples + self.examples = self.examples[: self.max_len] + return inp_imgs + + @staticmethod + def generate_samples(model, inp_imgs, steps=60, step_size=10, return_img_per_step=False): + """Function for sampling images for a given model. + + Args: + model: Neural network to use for modeling E_theta + inp_imgs: Images to start from for sampling. If you want to generate new images, enter noise between -1 and 1. + steps: Number of iterations in the MCMC algorithm. + step_size: Learning rate nu in the algorithm above + return_img_per_step: If True, we return the sample at every iteration of the MCMC + """ + # Before MCMC: set model parameters to "required_grad=False" + # because we are only interested in the gradients of the input. + is_training = model.training + model.eval() + for p in model.parameters(): + p.requires_grad = False + inp_imgs.requires_grad = True + + # Enable gradient calculation if not already the case + had_gradients_enabled = torch.is_grad_enabled() + torch.set_grad_enabled(True) + + # We use a buffer tensor in which we generate noise each loop iteration. + # More efficient than creating a new tensor every iteration. + noise = torch.randn(inp_imgs.shape, device=inp_imgs.device) + + # List for storing generations at each step (for later analysis) + imgs_per_step = [] + + # Loop over K (steps) + for _ in range(steps): + # Part 1: Add noise to the input. + noise.normal_(0, 0.005) + inp_imgs.data.add_(noise.data) + inp_imgs.data.clamp_(min=-1.0, max=1.0) + + # Part 2: calculate gradients for the current input. 
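+            # Remember that the network outputs -E_theta (see the remark above), so negating it here
+            # recovers the energy E_theta, which the update step below descends.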
+ out_imgs = -model(inp_imgs) + out_imgs.sum().backward() + inp_imgs.grad.data.clamp_(-0.03, 0.03) # For stabilizing and preventing too high gradients + + # Apply gradients to our current samples + inp_imgs.data.add_(-step_size * inp_imgs.grad.data) + inp_imgs.grad.detach_() + inp_imgs.grad.zero_() + inp_imgs.data.clamp_(min=-1.0, max=1.0) + + if return_img_per_step: + imgs_per_step.append(inp_imgs.clone().detach()) + + # Reactivate gradients for parameters for training + for p in model.parameters(): + p.requires_grad = True + model.train(is_training) + + # Reset gradient calculation to setting before this function + torch.set_grad_enabled(had_gradients_enabled) + + if return_img_per_step: + return torch.stack(imgs_per_step, dim=0) + else: + return inp_imgs + + +# %% [markdown] +# The idea of the buffer becomes a bit clearer in the following algorithm. + +# %% [markdown] +# ### Training algorithm +# +# With the sampling buffer being ready, we can complete our training algorithm. +# Below is shown a summary of the full training algorithm of an energy model on image modeling: +# +#
+# +# The first few statements in each training iteration concern the sampling of the real and fake data, +# as we have seen above with the sample buffer. +# Next, we calculate the contrastive divergence objective using our energy model $E_{\theta}$. +# However, one additional training trick we need is to add a regularization loss on the output of $E_{\theta}$. +# As the output of the network is not constrained and adding a large bias or not to the output +# doesn't change the contrastive divergence loss, we need to ensure somehow else that the output values are in a reasonable range. +# Without the regularization loss, the output values will fluctuate in a very large range. +# With this, we ensure that the values for the real data are around 0, and the fake data likely slightly lower +# (for noise or outliers the score can be still significantly lower). +# As the regularization loss is less important than the Contrastive Divergence, we have a weight factor +# $\alpha$ which is usually quite some smaller than 1. +# Finally, we perform an update step with an optimizer on the combined loss and add the new samples to the buffer. +# +# Below, we put this training dynamic into a PyTorch Lightning module: + + +# %% +class DeepEnergyModel(L.LightningModule): + def __init__(self, img_shape, batch_size, alpha=0.1, lr=1e-4, beta1=0.0, **CNN_args): + super().__init__() + self.save_hyperparameters() + + self.cnn = CNNModel(**CNN_args) + self.sampler = Sampler(self.cnn, img_shape=img_shape, sample_size=batch_size) + self.example_input_array = torch.zeros(1, *img_shape) + + def forward(self, x): + z = self.cnn(x) + return z + + def configure_optimizers(self): + # Energy models can have issues with momentum as the loss surfaces changes with its parameters. + # Hence, we set it to 0 by default. 
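+        # beta1 comes from the hyperparameters (default 0.0 in __init__), which disables Adam's first-moment momentum term.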
+ optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr, betas=(self.hparams.beta1, 0.999)) + scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.97) # Exponential decay over epochs + return [optimizer], [scheduler] + + def training_step(self, batch, batch_idx): + # We add minimal noise to the original images to prevent the model from focusing on purely "clean" inputs + real_imgs, _ = batch + small_noise = torch.randn_like(real_imgs) * 0.005 + real_imgs.add_(small_noise).clamp_(min=-1.0, max=1.0) + + # Obtain samples + fake_imgs = self.sampler.sample_new_exmps(steps=60, step_size=10) + + # Predict energy score for all images + inp_imgs = torch.cat([real_imgs, fake_imgs], dim=0) + real_out, fake_out = self.cnn(inp_imgs).chunk(2, dim=0) + + # Calculate losses + reg_loss = self.hparams.alpha * (real_out**2 + fake_out**2).mean() + cdiv_loss = fake_out.mean() - real_out.mean() + loss = reg_loss + cdiv_loss + + # Logging + self.log("loss", loss) + self.log("loss_regularization", reg_loss) + self.log("loss_contrastive_divergence", cdiv_loss) + self.log("metrics_avg_real", real_out.mean()) + self.log("metrics_avg_fake", fake_out.mean()) + return loss + + def validation_step(self, batch, batch_idx): + # For validating, we calculate the contrastive divergence between purely random images and unseen examples + # Note that the validation/test step of energy-based models depends on what we are interested in the model + real_imgs, _ = batch + fake_imgs = torch.rand_like(real_imgs) * 2 - 1 + + inp_imgs = torch.cat([real_imgs, fake_imgs], dim=0) + real_out, fake_out = self.cnn(inp_imgs).chunk(2, dim=0) + + cdiv = fake_out.mean() - real_out.mean() + self.log("val_contrastive_divergence", cdiv) + self.log("val_fake_out", fake_out.mean()) + self.log("val_real_out", real_out.mean()) + + +# %% [markdown] +# We do not implement a test step because energy-based, generative models are usually not evaluated on a test set. +# The validation step however is used to get an idea of the difference between ennergy/likelihood +# of random images to unseen examples of the dataset. + +# %% [markdown] +# ### Callbacks +# +# To track the performance of our model during training, we will make extensive use of PyTorch Lightning's callback framework. +# Remember that callbacks can be used for running small functions at any point of the training, +# for instance after finishing an epoch. +# Here, we will use three different callbacks we define ourselves. +# +# The first callback, called `GenerateCallback`, is used for adding image generations to the model during training. +# After every $N$ epochs (usually $N=5$ to reduce output to TensorBoard), we take a small batch +# of random images and perform many MCMC iterations until the model's generation converges. +# Compared to the training that used 60 iterations, we use 256 here because +# (1) we only have to do it once compared to the training that has to do it every iteration, and +# (2) we do not start from a buffer here, but from scratch. 
+# It is implemented as follows: + + +# %% +class GenerateCallback(Callback): + def __init__(self, batch_size=8, vis_steps=8, num_steps=256, every_n_epochs=5): + super().__init__() + self.batch_size = batch_size # Number of images to generate + self.vis_steps = vis_steps # Number of steps within generation to visualize + self.num_steps = num_steps # Number of steps to take during generation + # Only save those images every N epochs (otherwise tensorboard gets quite large) + self.every_n_epochs = every_n_epochs + + def on_epoch_end(self, trainer, pl_module): + # Skip for all other epochs + if trainer.current_epoch % self.every_n_epochs == 0: + # Generate images + imgs_per_step = self.generate_imgs(pl_module) + # Plot and add to tensorboard + for i in range(imgs_per_step.shape[1]): + step_size = self.num_steps // self.vis_steps + imgs_to_plot = imgs_per_step[step_size - 1 :: step_size, i] + grid = torchvision.utils.make_grid( + imgs_to_plot, nrow=imgs_to_plot.shape[0], normalize=True, range=(-1, 1) + ) + trainer.logger.experiment.add_image("generation_%i" % i, grid, global_step=trainer.current_epoch) + + def generate_imgs(self, pl_module): + pl_module.eval() + start_imgs = torch.rand((self.batch_size,) + pl_module.hparams["img_shape"]).to(pl_module.device) + start_imgs = start_imgs * 2 - 1 + imgs_per_step = Sampler.generate_samples( + pl_module.cnn, start_imgs, steps=self.num_steps, step_size=10, return_img_per_step=True + ) + pl_module.train() + return imgs_per_step + + +# %% [markdown] +# The second callback is called `SamplerCallback`, and simply adds a randomly picked subset of images +# in the sampling buffer to the TensorBoard. +# This helps to understand what images are currently shown to the model as "fake". + + +# %% +class SamplerCallback(Callback): + def __init__(self, num_imgs=32, every_n_epochs=5): + super().__init__() + self.num_imgs = num_imgs # Number of images to plot + # Only save those images every N epochs (otherwise tensorboard gets quite large) + self.every_n_epochs = every_n_epochs + + def on_epoch_end(self, trainer, pl_module): + if trainer.current_epoch % self.every_n_epochs == 0: + exmp_imgs = torch.cat(random.choices(pl_module.sampler.examples, k=self.num_imgs), dim=0) + grid = torchvision.utils.make_grid(exmp_imgs, nrow=4, normalize=True, range=(-1, 1)) + trainer.logger.experiment.add_image("sampler", grid, global_step=trainer.current_epoch) + + +# %% [markdown] +# Finally, our last callback is `OutlierCallback`. +# This callback evaluates the model by recording the (negative) energy assigned to random noise. +# While our training loss is almost constant across iterations, +# this score is likely showing the progress of the model to detect "outliers". + + +# %% +class OutlierCallback(Callback): + def __init__(self, batch_size=1024): + super().__init__() + self.batch_size = batch_size + + def on_epoch_end(self, trainer, pl_module): + with torch.no_grad(): + pl_module.eval() + rand_imgs = torch.rand((self.batch_size,) + pl_module.hparams["img_shape"]).to(pl_module.device) + rand_imgs = rand_imgs * 2 - 1.0 + rand_out = pl_module.cnn(rand_imgs).mean() + pl_module.train() + + trainer.logger.experiment.add_scalar("rand_out", rand_out, global_step=trainer.current_epoch) + + +# %% [markdown] +# ### Running the model +# +# Finally, we can add everything together to create our final training function. +# The function is very similar to any other PyTorch Lightning training function we have seen so far. 
+# However, there is the small difference of that we do not test the model on a test set +# because we will analyse the model afterward by checking its prediction and ability to perform outlier detection. + + +# %% +def train_model(**kwargs): + # Create a PyTorch Lightning trainer with the generation callback + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, "MNIST"), + accelerator="auto", + devices=1, + max_epochs=60, + gradient_clip_val=0.1, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_contrastive_divergence"), + GenerateCallback(every_n_epochs=5), + SamplerCallback(every_n_epochs=5), + OutlierCallback(), + LearningRateMonitor("epoch"), + ], + ) + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "MNIST.ckpt") + if os.path.isfile(pretrained_filename): + print("Found pretrained model, loading...") + model = DeepEnergyModel.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything(42) + model = DeepEnergyModel(**kwargs) + trainer.fit(model, train_loader, test_loader) + model = DeepEnergyModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + # No testing as we are more interested in other properties + return model + + +# %% +model = train_model(img_shape=(1, 28, 28), batch_size=train_loader.batch_size, lr=1e-4, beta1=0.0) + +# %% [markdown] +# ## Analysis +# +# In the last part of the notebook, we will try to take the trained energy-based generative model, +# and analyse its properties. + +# %% [markdown] +# ### TensorBoard +# +# The first thing we can look at is the TensorBoard generate during training. +# This can help us to understand the training dynamic even better, and shows potential issues. +# Let's load the TensorBoard below: + +# %% +# Uncomment the following two lines to open a tensorboard in the notebook. +# Adjust the path to your CHECKPOINT_PATH if needed. +# %load_ext tensorboard +# %tensorboard --logdir ../saved_models/tutorial8/tensorboards/ + +# %% [markdown] +#
+ +# %% [markdown] +# We see that the contrastive divergence as well as the regularization converge quickly to 0. +# However, the training continues although the loss is always close to zero. +# This is because our "training" data changes with the model by sampling. +# The progress of training can be best measured by looking at the samples across iterations, +# and the score for random images that decreases constantly over time. + +# %% [markdown] +# ### Image Generation +# +# Another way of evaluating generative models is by sampling a few generated images. +# Generative models need to be good at generating realistic images as this truely shows that they have modeled the true data distribution. +# Thus, let's sample a few images of the model below: + +# %% +model.to(device) +L.seed_everything(43) +callback = GenerateCallback(batch_size=4, vis_steps=8, num_steps=256) +imgs_per_step = callback.generate_imgs(model) +imgs_per_step = imgs_per_step.cpu() + +# %% [markdown] +# The characteristic of sampling with energy-based models is that they require the iterative MCMC algorithm. +# To gain an insight in how the images change over iterations, we plot a few intermediate samples in the MCMC as well: + +# %% +for i in range(imgs_per_step.shape[1]): + step_size = callback.num_steps // callback.vis_steps + imgs_to_plot = imgs_per_step[step_size - 1 :: step_size, i] + imgs_to_plot = torch.cat([imgs_per_step[0:1, i], imgs_to_plot], dim=0) + grid = torchvision.utils.make_grid( + imgs_to_plot, nrow=imgs_to_plot.shape[0], normalize=True, range=(-1, 1), pad_value=0.5, padding=2 + ) + grid = grid.permute(1, 2, 0) + plt.figure(figsize=(8, 8)) + plt.imshow(grid) + plt.xlabel("Generation iteration") + plt.xticks( + [(imgs_per_step.shape[-1] + 2) * (0.5 + j) for j in range(callback.vis_steps + 1)], + labels=[1] + list(range(step_size, imgs_per_step.shape[0] + 1, step_size)), + ) + plt.yticks([]) + plt.show() + +# %% [markdown] +# We see that although starting from noise in the very first step, the sampling algorithm obtains reasonable shapes after only 32 steps. +# Over the next 200 steps, the shapes become clearer and changed towards realistic digits. +# The specific samples can differ when you run the code on Colab, hence the following description is specific to the plots shown on the website. +# The first row shows an 8, where we remove unnecessary white parts over iterations. +# The transformation across iterations can be seen at best for the second sample, which creates a digit of 2. +# While the first sample after 32 iterations looks a bit like a digit, but not really, +# the sample is transformed more and more to a typical image of the digit 2. + +# %% [markdown] +# ### Out-of-distribution detection +# +# A very common and strong application of energy-based models is out-of-distribution detection +# (sometimes referred to as "anomaly" detection). +# As more and more deep learning models are applied in production and applications, +# a crucial aspect of these models is to know what the models don't know. +# Deep learning models are usually overconfident, meaning that they classify even random images sometimes with 100% probability. +# Clearly, this is not something that we want to see in applications. +# Energy-based models can help with this problem because they are trained to detect images that do not fit the training dataset distribution. 
+# Thus, in those applications, you could train an energy-based model along with the classifier, +# and only output predictions if the energy-based models assign a (unnormalized) probability higher than $\delta$ to the image. +# You can actually combine classifiers and energy-based objectives in a single model, +# as proposed in this [paper](https://arxiv.org/abs/1912.03263). +# +# In this part of the analysis, we want to test the out-of-distribution capability of our energy-based model. +# Remember that a lower output of the model denotes a low probability. +# Thus, we hope to see low scores if we enter random noise to the model: + +# %% +with torch.no_grad(): + rand_imgs = torch.rand((128,) + model.hparams.img_shape).to(model.device) + rand_imgs = rand_imgs * 2 - 1.0 + rand_out = model.cnn(rand_imgs).mean() + print("Average score for random images: %4.2f" % (rand_out.item())) + +# %% [markdown] +# As we hoped, the model assigns very low probability to those noisy images. +# As another reference, let's look at predictions for a batch of images from the training set: + +# %% +with torch.no_grad(): + train_imgs, _ = next(iter(train_loader)) + train_imgs = train_imgs.to(model.device) + train_out = model.cnn(train_imgs).mean() + print("Average score for training images: %4.2f" % (train_out.item())) + +# %% [markdown] +# The scores are close to 0 because of the regularization objective that was added to the training. +# So clearly, the model can distinguish between noise and real digits. +# However, what happens if we change the training images a little, and see which ones gets a very low score? + + +# %% +@torch.no_grad() +def compare_images(img1, img2): + imgs = torch.stack([img1, img2], dim=0).to(model.device) + score1, score2 = model.cnn(imgs).cpu().chunk(2, dim=0) + grid = torchvision.utils.make_grid( + [img1.cpu(), img2.cpu()], nrow=2, normalize=True, range=(-1, 1), pad_value=0.5, padding=2 + ) + grid = grid.permute(1, 2, 0) + plt.figure(figsize=(4, 4)) + plt.imshow(grid) + plt.xticks([(img1.shape[2] + 2) * (0.5 + j) for j in range(2)], labels=["Original image", "Transformed image"]) + plt.yticks([]) + plt.show() + print("Score original image: %4.2f" % score1) + print("Score transformed image: %4.2f" % score2) + + +# %% [markdown] +# We use a random test image for this. Feel free to change it to experiment with the model yourself. + +# %% +test_imgs, _ = next(iter(test_loader)) +exmp_img = test_imgs[0].to(model.device) + +# %% [markdown] +# The first transformation is to add some random noise to the image: + +# %% +img_noisy = exmp_img + torch.randn_like(exmp_img) * 0.3 +img_noisy.clamp_(min=-1.0, max=1.0) +compare_images(exmp_img, img_noisy) + +# %% [markdown] +# We can see that the score considerably drops. +# Hence, the model can detect random Gaussian noise on the image. +# This is also to expect as initially, the "fake" samples are pure noise images. +# +# Next, we flip an image and check how this influences the score: + +# %% +img_flipped = exmp_img.flip(dims=(1, 2)) +compare_images(exmp_img, img_flipped) + +# %% [markdown] +# If the digit can only be read in this way, for example, the 7, then we can see that the score drops. +# However, the score only drops slightly. +# This is likely because of the small size of our model. +# Keep in mind that generative modeling is a much harder task than classification, +# as we do not only need to distinguish between classes but learn **all** details/characteristics of the digits. 
+# With a deeper model, this could eventually be captured better (but at the cost of greater training instability). +# +# Finally, we check what happens if we reduce the digit significantly in size: + +# %% +img_tiny = torch.zeros_like(exmp_img) - 1 +img_tiny[:, exmp_img.shape[1] // 2 :, exmp_img.shape[2] // 2 :] = exmp_img[:, ::2, ::2] +compare_images(exmp_img, img_tiny) + +# %% [markdown] +# The score again drops but not by a large margin, although digits in the MNIST dataset usually are much larger. +# +# Overall, we can conclude that our model is good for detecting Gaussian noise and smaller transformations to existing digits. +# Nonetheless, to obtain a very good out-of-distribution model, we would need to train deeper models and for more iterations. + +# %% [markdown] +# ### Instability +# +# Finally, we should discuss the possible instabilities of energy-based models, +# in particular for the example of image generation that we have implemented in this notebook. +# In the process of hyperparameter search for this notebook, there have been several models that diverged. +# Divergence in energy-based models means that the models assign a high probability to examples of the training set which is a good thing. +# However, at the same time, the sampling algorithm fails and only generates noise images that obtain minimal probability scores. +# This happens because the model has created many local maxima in which the generated noise images fall. +# The energy surface over which we calculate the gradients to reach data points with high probability has "diverged" and is not useful for our MCMC sampling. +# +# Besides finding the optimal hyperparameters, a common trick in energy-based models is to reload stable checkpoints. +# If we detect that the model is diverging, we stop the training, load the model from one epoch ago where it did not diverge yet. +# Afterward, we continue training and hope that with a different seed the model is not diverging again. +# Nevertheless, this should be considered as the "last hope" for stabilizing the models, +# and careful hyperparameter tuning is the better way to do so. +# Sensitive hyperparameters include `step_size`, `steps` and the noise standard deviation in the sampler, +# and the learning rate and feature dimensionality in the CNN model. + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we have discussed energy-based models for generative modeling. +# The concept relies on the idea that any strictly positive function can be turned into a probability +# distribution by normalizing over the whole dataset. +# As this is not reasonable to calculate for high dimensional data like images, +# we train the model using contrastive divergence and sampling via MCMC. +# While the idea allows us to turn any neural network into an energy-based model, +# we have seen that there are multiple training tricks needed to stabilize the training. +# Furthermore, the training time of these models is relatively long as, during every training iteration, +# we need to sample new "fake" images, even with a sampling buffer. +# In the next lectures and assignment, we will see different generative models (e.g. VAE, GAN, NF) +# that allow us to do generative modeling more stably, but with the cost of more parameters. 
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/contrastive_divergence.svg b/course_UvA-DL/07-deep-energy-based-generative-models/contrastive_divergence.svg new file mode 100644 index 0000000..bc084ae --- /dev/null +++ b/course_UvA-DL/07-deep-energy-based-generative-models/contrastive_divergence.svg @@ -0,0 +1,84 @@
+ (SVG markup omitted)
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/energy_models_application.png b/course_UvA-DL/07-deep-energy-based-generative-models/energy_models_application.png new file mode 100644 index 0000000..71fee99 Binary files /dev/null and b/course_UvA-DL/07-deep-energy-based-generative-models/energy_models_application.png differ
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/sampling.svg b/course_UvA-DL/07-deep-energy-based-generative-models/sampling.svg new file mode 100644 index 0000000..b6fbd2a --- /dev/null +++ b/course_UvA-DL/07-deep-energy-based-generative-models/sampling.svg @@ -0,0 +1,2562 @@
+ (SVG markup omitted)
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/tensorboard_screenshot.png b/course_UvA-DL/07-deep-energy-based-generative-models/tensorboard_screenshot.png new file mode 100644 index 0000000..e514684 Binary files /dev/null and b/course_UvA-DL/07-deep-energy-based-generative-models/tensorboard_screenshot.png differ
diff --git a/course_UvA-DL/07-deep-energy-based-generative-models/training_algorithm.svg b/course_UvA-DL/07-deep-energy-based-generative-models/training_algorithm.svg new file mode 100644 index 0000000..b1c76d6 --- /dev/null +++ b/course_UvA-DL/07-deep-energy-based-generative-models/training_algorithm.svg @@ -0,0 +1,5567 @@
+ (SVG markup omitted)
diff --git a/course_UvA-DL/08-deep-autoencoders/.meta.yml b/course_UvA-DL/08-deep-autoencoders/.meta.yml new file mode 100644 index 0000000..5c4df67 --- /dev/null +++ b/course_UvA-DL/08-deep-autoencoders/.meta.yml @@ -0,0 +1,28 @@
+title: "Tutorial 8: Deep Autoencoders"
+author: Phillip Lippe
+created: 2021-07-12
+updated: 2023-03-14
+license: CC BY-SA
+build: 0
+tags:
+ - Image
+description: |
+ In this tutorial, we will take a closer look at autoencoders (AE).
+ Autoencoders are trained on encoding input data such as images into a smaller feature vector,
+ and afterward, reconstruct it by a second neural network, called a decoder.
+ The feature vector is called the "bottleneck" of the network as we aim to compress the input data into a smaller amount of features.
+ This property is useful in many applications, in particular in compressing data or comparing images on a metric beyond pixel-level comparisons.
+ Besides learning about the autoencoder framework, we will also see the "deconvolution"
+ (or transposed convolution) operator in action for scaling up feature maps in height and width.
+ Such deconvolution networks are necessary wherever we start from a small feature vector
+ and need to output an image of full size (e.g. in VAE, GANs, or super-resolution applications).
+ This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - torchvision + - matplotlib + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/course_UvA-DL/08-deep-autoencoders/.thumb.jpg b/course_UvA-DL/08-deep-autoencoders/.thumb.jpg new file mode 100644 index 0000000..1b07169 Binary files /dev/null and b/course_UvA-DL/08-deep-autoencoders/.thumb.jpg differ diff --git a/course_UvA-DL/08-deep-autoencoders/Deep_Autoencoders.py b/course_UvA-DL/08-deep-autoencoders/Deep_Autoencoders.py new file mode 100644 index 0000000..6d4fbf6 --- /dev/null +++ b/course_UvA-DL/08-deep-autoencoders/Deep_Autoencoders.py @@ -0,0 +1,715 @@ +# %% [markdown] +#
+

# %%
import os
import urllib.request
from urllib.error import HTTPError

import lightning as L
import matplotlib
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
from lightning.pytorch.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
from torchvision.datasets import CIFAR10
from tqdm.notebook import tqdm

# %matplotlib inline
matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf")  # For export
matplotlib.rcParams["lines.linewidth"] = 2.0
sns.reset_orig()
sns.set()

# Tensorboard extension (for visualization purposes later)
# %load_ext tensorboard

# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = os.environ.get("PATH_DATASETS", "data")
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/tutorial9")

# Setting the seed
L.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

# %% [markdown]
# We have 4 pretrained models that we have to download.
# Remember to adjust the variables `DATASET_PATH` and `CHECKPOINT_PATH` if needed.

# %%
# Github URL where saved models are stored for this tutorial
base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial9/"
# Files to download
pretrained_files = ["cifar10_64.ckpt", "cifar10_128.ckpt", "cifar10_256.ckpt", "cifar10_384.ckpt"]
# Create checkpoint path if it doesn't exist yet
os.makedirs(CHECKPOINT_PATH, exist_ok=True)

# For each file, check whether it already exists. If not, try downloading it.
for file_name in pretrained_files:
    file_path = os.path.join(CHECKPOINT_PATH, file_name)
    if not os.path.isfile(file_path):
        file_url = base_url + file_name
        print("Downloading %s..." % file_url)
        try:
            urllib.request.urlretrieve(file_url, file_path)
        except HTTPError as e:
            print(
                "Something went wrong. Please try to download the files manually,"
                " or contact the author with the full output including the following error:\n",
                e,
            )

# %% [markdown]
# In this tutorial, we work with the CIFAR10 dataset.
# In CIFAR10, each image has 3 color channels and is 32x32 pixels large.
# As autoencoders do not have the constraint of modeling images probabilistically, we can work with more complex image data
# (i.e. 3 color channels instead of black-and-white) much more easily than with VAEs.
# In case you have downloaded CIFAR10 already in a different directory, make sure to set DATASET_PATH
# accordingly to prevent another download.
#
# In contrast to previous tutorials on CIFAR10 like
# [Tutorial 5](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial5/Inception_ResNet_DenseNet.html)
# (CNN classification), we do not normalize the data explicitly with a mean of 0 and std of 1,
# but roughly approximate this by scaling the data to the range -1 to 1.
# This is because limiting the range will make our task of predicting/reconstructing images easier.
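
# %% [markdown]
# As a quick, purely illustrative check of this scaling: `ToTensor` maps raw pixel values to $[0, 1]$,
# and `Normalize((0.5,), (0.5,))` (used in the next cell) then shifts and scales them to $[-1, 1]$:

# %%
check_norm = transforms.Normalize((0.5,), (0.5,))
# Dummy values emulating the [0, 1] output of ToTensor for pixel intensities 0, 128 and 255
check_tensor = torch.tensor([0.0, 128.0, 255.0]).div(255).repeat(3, 4, 1)
print("Value range after normalization: [%.2f, %.2f]" % (check_norm(check_tensor).min(), check_norm(check_tensor).max()))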
+ +# %% +# Transformations applied on each image => only make them a tensor +transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]) + +# Loading the training dataset. We need to split it into a training and validation part +train_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=transform, download=True) +L.seed_everything(42) +train_set, val_set = torch.utils.data.random_split(train_dataset, [45000, 5000]) + +# Loading the test set +test_set = CIFAR10(root=DATASET_PATH, train=False, transform=transform, download=True) + +# We define a set of data loaders that we can use for various purposes later. +train_loader = data.DataLoader(train_set, batch_size=256, shuffle=True, drop_last=True, pin_memory=True, num_workers=4) +val_loader = data.DataLoader(val_set, batch_size=256, shuffle=False, drop_last=False, num_workers=4) +test_loader = data.DataLoader(test_set, batch_size=256, shuffle=False, drop_last=False, num_workers=4) + + +def get_train_images(num): + return torch.stack([train_dataset[i][0] for i in range(num)], dim=0) + + +# %% [markdown] +# ## Building the autoencoder +# +# In general, an autoencoder consists of an **encoder** that maps the input $x$ to a lower-dimensional feature vector $z$, +# and a **decoder** that reconstructs the input $\hat{x}$ from $z$. +# We train the model by comparing $x$ to $\hat{x}$ and optimizing the parameters to increase the similarity between $x$ and $\hat{x}$. +# See below for a small illustration of the autoencoder framework. + +# %% [markdown] +#
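
# %% [markdown]
# In equation form (a compact summary of the description above, using the mean squared error we will
# implement later as reconstruction loss), the autoencoder is trained to minimize
#
# $$\mathcal{L}_{\text{rec}} = \mathbb{E}_x\left[\lVert x - \hat{x}\rVert_2^2\right]
# \qquad\text{with}\qquad \hat{x} = \text{decoder}(\text{encoder}(x)).$$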
+ +# %% [markdown] +# We first start by implementing the encoder. +# The encoder effectively consists of a deep convolutional network, where we scale down the image layer-by-layer using strided convolutions. +# After downscaling the image three times, we flatten the features and apply linear layers. +# The latent representation $z$ is therefore a vector of size *d* which can be flexibly selected. + + +# %% +class Encoder(nn.Module): + def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU): + """Encoder. + + Args: + num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3 + base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it. + latent_dim : Dimensionality of latent representation z + act_fn : Activation function used throughout the encoder network + """ + super().__init__() + c_hid = base_channel_size + self.net = nn.Sequential( + nn.Conv2d(num_input_channels, c_hid, kernel_size=3, padding=1, stride=2), # 32x32 => 16x16 + act_fn(), + nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1), + act_fn(), + nn.Conv2d(c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2), # 16x16 => 8x8 + act_fn(), + nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1), + act_fn(), + nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2), # 8x8 => 4x4 + act_fn(), + nn.Flatten(), # Image grid to single feature vector + nn.Linear(2 * 16 * c_hid, latent_dim), + ) + + def forward(self, x): + return self.net(x) + + +# %% [markdown] +# Note that we do not apply Batch Normalization here. +# This is because we want the encoding of each image to be independent of all the other images. +# Otherwise, we might introduce correlations into the encoding or decoding that we do not want to have. +# In some implementations, you still can see Batch Normalization being used, because it can also serve as a form of regularization. +# Nevertheless, the better practice is to go with other normalization techniques if necessary like Instance Normalization or Layer Normalization. +# Given the small size of the model, we can neglect normalization for now. + +# %% [markdown] +# The decoder is a mirrored, flipped version of the encoder. +# The only difference is that we replace strided convolutions by transposed convolutions +# (i.e. deconvolutions) to upscale the features. +# Transposed convolutions can be imagined as adding the stride to the input instead of the output, +# and can thus upscale the input. +# For an illustration of a `nn.ConvTranspose2d` layer with kernel size 3, stride 2, and padding 1, +# see below (figure credit - [Vincent Dumoulin and Francesco Visin](https://arxiv.org/abs/1603.07285)): +# +#
+# +# You see that for an input of size $3\times3$, we obtain an output of $5\times5$. +# However, to truly have a reverse operation of the convolution, +# we need to ensure that the layer scales the input shape by a factor of 2 (e.g. $4\times4\to8\times8$). +# For this, we can specify the parameter `output_padding` which adds additional values to the output shape. +# Note that we do not perform zero-padding with this, but rather increase the output shape for calculation. +# +# Overall, the decoder can be implemented as follows: + + +# %% +class Decoder(nn.Module): + def __init__(self, num_input_channels: int, base_channel_size: int, latent_dim: int, act_fn: object = nn.GELU): + """Decoder. + + Args: + num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3 + base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it. + latent_dim : Dimensionality of latent representation z + act_fn : Activation function used throughout the decoder network + """ + super().__init__() + c_hid = base_channel_size + self.linear = nn.Sequential(nn.Linear(latent_dim, 2 * 16 * c_hid), act_fn()) + self.net = nn.Sequential( + nn.ConvTranspose2d( + 2 * c_hid, 2 * c_hid, kernel_size=3, output_padding=1, padding=1, stride=2 + ), # 4x4 => 8x8 + act_fn(), + nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1), + act_fn(), + nn.ConvTranspose2d(2 * c_hid, c_hid, kernel_size=3, output_padding=1, padding=1, stride=2), # 8x8 => 16x16 + act_fn(), + nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1), + act_fn(), + nn.ConvTranspose2d( + c_hid, num_input_channels, kernel_size=3, output_padding=1, padding=1, stride=2 + ), # 16x16 => 32x32 + nn.Tanh(), # The input images is scaled between -1 and 1, hence the output has to be bounded as well + ) + + def forward(self, x): + x = self.linear(x) + x = x.reshape(x.shape[0], -1, 4, 4) + x = self.net(x) + return x + + +# %% [markdown] +# The encoder and decoder networks we chose here are relatively simple. +# Usually, more complex networks are applied, especially when using a ResNet-based architecture. +# For example, see [VQ-VAE](https://arxiv.org/abs/1711.00937) and +# [NVAE](https://arxiv.org/abs/2007.03898) (although the papers discuss architectures for VAEs, +# they can equally be applied to standard autoencoders). +# +# In a final step, we add the encoder and decoder together into the autoencoder architecture. 
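# Before doing so, a quick, purely illustrative sanity check: a freshly initialized encoder-decoder pair
# should map a CIFAR-sized input back to its original shape (the latent dimensionality of 128 below is
# just an arbitrary example value).

# %%
enc_check = Encoder(num_input_channels=3, base_channel_size=32, latent_dim=128)
dec_check = Decoder(num_input_channels=3, base_channel_size=32, latent_dim=128)
print(dec_check(enc_check(torch.randn(1, 3, 32, 32))).shape)  # expected: torch.Size([1, 3, 32, 32])

# %% [markdown]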
+# We define the autoencoder as PyTorch Lightning Module to simplify the needed training code: + + +# %% +class Autoencoder(L.LightningModule): + def __init__( + self, + base_channel_size: int, + latent_dim: int, + encoder_class: object = Encoder, + decoder_class: object = Decoder, + num_input_channels: int = 3, + width: int = 32, + height: int = 32, + ): + super().__init__() + # Saving hyperparameters of autoencoder + self.save_hyperparameters() + # Creating encoder and decoder + self.encoder = encoder_class(num_input_channels, base_channel_size, latent_dim) + self.decoder = decoder_class(num_input_channels, base_channel_size, latent_dim) + # Example input array needed for visualizing the graph of the network + self.example_input_array = torch.zeros(2, num_input_channels, width, height) + + def forward(self, x): + """The forward function takes in an image and returns the reconstructed image.""" + z = self.encoder(x) + x_hat = self.decoder(z) + return x_hat + + def _get_reconstruction_loss(self, batch): + """Given a batch of images, this function returns the reconstruction loss (MSE in our case).""" + x, _ = batch # We do not need the labels + x_hat = self.forward(x) + loss = F.mse_loss(x, x_hat, reduction="none") + loss = loss.sum(dim=[1, 2, 3]).mean(dim=[0]) + return loss + + def configure_optimizers(self): + optimizer = optim.Adam(self.parameters(), lr=1e-3) + # Using a scheduler is optional but can be helpful. + # The scheduler reduces the LR if the validation performance hasn't improved for the last N epochs + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2, patience=20, min_lr=5e-5) + return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"} + + def training_step(self, batch, batch_idx): + loss = self._get_reconstruction_loss(batch) + self.log("train_loss", loss) + return loss + + def validation_step(self, batch, batch_idx): + loss = self._get_reconstruction_loss(batch) + self.log("val_loss", loss) + + def test_step(self, batch, batch_idx): + loss = self._get_reconstruction_loss(batch) + self.log("test_loss", loss) + + +# %% [markdown] +# For the loss function, we use the mean squared error (MSE). +# The mean squared error pushes the network to pay special attention to those pixel values its estimate is far away. +# Predicting 127 instead of 128 is not important when reconstructing, but confusing 0 with 128 is much worse. +# Note that in contrast to VAEs, we do not predict the probability per pixel value, but instead use a distance measure. +# This saves a lot of parameters and simplifies training. +# To get a better intuition per pixel, we report the summed squared error averaged over the batch dimension +# (any other mean/sum leads to the same result/parameters). +# +# However, MSE has also some considerable disadvantages. +# Usually, MSE leads to blurry images where small noise/high-frequent patterns are removed as those cause a very low error. +# To ensure realistic images to be reconstructed, one could combine Generative Adversarial Networks +# (lecture 10) with autoencoders as done in several works (e.g. see [here](https://arxiv.org/abs/1704.02304), +# [here](https://arxiv.org/abs/1511.05644) or these [slides](http://elarosca.net/slides/iccv_autoencoder_gans.pdf)). +# Additionally, comparing two images using MSE does not necessarily reflect their visual similarity. +# For instance, suppose the autoencoder reconstructs an image shifted by one pixel to the right and bottom. 
+# Although the images are almost identical, we can get a higher loss than predicting a constant pixel value for half of the image (see code below). +# An example solution for this issue includes using a separate, pre-trained CNN, +# and use a distance of visual features in lower layers as a distance measure instead of the original pixel-level comparison. + + +# %% +def compare_imgs(img1, img2, title_prefix=""): + # Calculate MSE loss between both images + loss = F.mse_loss(img1, img2, reduction="sum") + # Plot images for visual comparison + grid = torchvision.utils.make_grid(torch.stack([img1, img2], dim=0), nrow=2, normalize=True, range=(-1, 1)) + grid = grid.permute(1, 2, 0) + plt.figure(figsize=(4, 2)) + plt.title(f"{title_prefix} Loss: {loss.item():4.2f}") + plt.imshow(grid) + plt.axis("off") + plt.show() + + +for i in range(2): + # Load example image + img, _ = train_dataset[i] + img_mean = img.mean(dim=[1, 2], keepdims=True) + + # Shift image by one pixel + SHIFT = 1 + img_shifted = torch.roll(img, shifts=SHIFT, dims=1) + img_shifted = torch.roll(img_shifted, shifts=SHIFT, dims=2) + img_shifted[:, :1, :] = img_mean + img_shifted[:, :, :1] = img_mean + compare_imgs(img, img_shifted, "Shifted -") + + # Set half of the image to zero + img_masked = img.clone() + img_masked[:, : img_masked.shape[1] // 2, :] = img_mean + compare_imgs(img, img_masked, "Masked -") + +# %% [markdown] +# ### Training the model +# +# During the training, we want to keep track of the learning progress by seeing reconstructions made by our model. +# For this, we implement a callback object in PyTorch Lightning which will add reconstructions every $N$ epochs to our tensorboard: + + +# %% +class GenerateCallback(Callback): + def __init__(self, input_imgs, every_n_epochs=1): + super().__init__() + self.input_imgs = input_imgs # Images to reconstruct during training + # Only save those images every N epochs (otherwise tensorboard gets quite large) + self.every_n_epochs = every_n_epochs + + def on_train_epoch_end(self, trainer, pl_module): + if trainer.current_epoch % self.every_n_epochs == 0: + # Reconstruct images + input_imgs = self.input_imgs.to(pl_module.device) + with torch.no_grad(): + pl_module.eval() + reconst_imgs = pl_module(input_imgs) + pl_module.train() + # Plot and add to tensorboard + imgs = torch.stack([input_imgs, reconst_imgs], dim=1).flatten(0, 1) + grid = torchvision.utils.make_grid(imgs, nrow=2, normalize=True, range=(-1, 1)) + trainer.logger.experiment.add_image("Reconstructions", grid, global_step=trainer.global_step) + + +# %% [markdown] +# We will now write a training function that allows us to train the autoencoder with different latent dimensionality +# and returns both the test and validation score. +# We provide pre-trained models and recommend you using those, especially when you work on a computer without GPU. +# Of course, feel free to train your own models on Lisa. 
+ + +# %% +def train_cifar(latent_dim): + # Create a PyTorch Lightning trainer with the generation callback + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, "cifar10_%i" % latent_dim), + accelerator="auto", + devices=1, + max_epochs=500, + callbacks=[ + ModelCheckpoint(save_weights_only=True), + GenerateCallback(get_train_images(8), every_n_epochs=10), + LearningRateMonitor("epoch"), + ], + ) + trainer.logger._log_graph = True # If True, we plot the computation graph in tensorboard + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "cifar10_%i.ckpt" % latent_dim) + if os.path.isfile(pretrained_filename): + print("Found pretrained model, loading...") + model = Autoencoder.load_from_checkpoint(pretrained_filename) + else: + model = Autoencoder(base_channel_size=32, latent_dim=latent_dim) + trainer.fit(model, train_loader, val_loader) + # Test best model on validation and test set + val_result = trainer.test(model, dataloaders=val_loader, verbose=False) + test_result = trainer.test(model, dataloaders=test_loader, verbose=False) + result = {"test": test_result, "val": val_result} + return model, result + + +# %% [markdown] +# ### Comparing latent dimensionality +# +#
+# +# When training an autoencoder, we need to choose a dimensionality for the latent representation $z$. +# The higher the latent dimensionality, the better we expect the reconstruction to be. +# However, the idea of autoencoders is to *compress* data. +# Hence, we are also interested in keeping the dimensionality low. +# To find the best tradeoff, we can train multiple models with different latent dimensionalities. +# The original input has $32\times 32\times 3 = 3072$ pixels. +# Keeping this in mind, a reasonable choice for the latent dimensionality might be between 64 and 384: + +# %% +model_dict = {} +for latent_dim in [64, 128, 256, 384]: + model_ld, result_ld = train_cifar(latent_dim) + model_dict[latent_dim] = {"model": model_ld, "result": result_ld} + +# %% [markdown] +# After training the models, we can plot the reconstruction loss over the latent dimensionality to get an intuition +# how these two properties are correlated: + +# %% +latent_dims = sorted(k for k in model_dict) +val_scores = [model_dict[k]["result"]["val"][0]["test_loss"] for k in latent_dims] + +fig = plt.figure(figsize=(6, 4)) +plt.plot( + latent_dims, val_scores, "--", color="#000", marker="*", markeredgecolor="#000", markerfacecolor="y", markersize=16 +) +plt.xscale("log") +plt.xticks(latent_dims, labels=latent_dims) +plt.title("Reconstruction error over latent dimensionality", fontsize=14) +plt.xlabel("Latent dimensionality") +plt.ylabel("Reconstruction error") +plt.minorticks_off() +plt.ylim(0, 100) +plt.show() + +# %% [markdown] +# As we initially expected, the reconstruction loss goes down with increasing latent dimensionality. +# For our model and setup, the two properties seem to be exponentially (or double exponentially) correlated. +# To understand what these differences in reconstruction error mean, we can visualize example reconstructions of the four models: + + +# %% +def visualize_reconstructions(model, input_imgs): + # Reconstruct images + model.eval() + with torch.no_grad(): + reconst_imgs = model(input_imgs.to(model.device)) + reconst_imgs = reconst_imgs.cpu() + + # Plotting + imgs = torch.stack([input_imgs, reconst_imgs], dim=1).flatten(0, 1) + grid = torchvision.utils.make_grid(imgs, nrow=4, normalize=True, range=(-1, 1)) + grid = grid.permute(1, 2, 0) + plt.figure(figsize=(7, 4.5)) + plt.title("Reconstructed from %i latents" % (model.hparams.latent_dim)) + plt.imshow(grid) + plt.axis("off") + plt.show() + + +# %% +input_imgs = get_train_images(4) +for latent_dim in model_dict: + visualize_reconstructions(model_dict[latent_dim]["model"], input_imgs) + +# %% [markdown] +# Clearly, the smallest latent dimensionality can only save information about the rough shape and color of the object, +# but the reconstructed image is extremely blurry and it is hard to recognize the original object in the reconstruction. +# With 128 features, we can recognize some shapes again although the picture remains blurry. +# The models with the highest two dimensionalities reconstruct the images quite well. +# The difference between 256 and 384 is marginal at first sight but can be noticed when comparing, for instance, +# the backgrounds of the first image (the 384 features model more of the pattern than 256). + +# %% [markdown] +# ### Out-of-distribution images +# +# Before continuing with the applications of autoencoder, we can actually explore some limitations of our autoencoder. +# For example, what happens if we try to reconstruct an image that is clearly out of the distribution of our dataset? 
+# We expect the decoder to have learned some common patterns in the dataset,
# and thus might in particular fail to reconstruct images that do not follow these patterns.
#
# The first experiment we can try is to reconstruct noise.
# We therefore create two images whose pixels are randomly sampled from a uniform distribution over pixel values,
# and visualize the reconstruction of the model (feel free to test different latent dimensionalities):

# %%
rand_imgs = torch.rand(2, 3, 32, 32) * 2 - 1
visualize_reconstructions(model_dict[256]["model"], rand_imgs)

# %% [markdown]
# The reconstruction of the noise is quite poor, and seems to introduce some rough patterns.
# As the input does not follow the patterns of the CIFAR dataset, the model has issues reconstructing it accurately.
#
# We can also check how well the model can reconstruct other manually-coded patterns:

# %%
plain_imgs = torch.zeros(4, 3, 32, 32)

# Single color channel
plain_imgs[1, 0] = 1
# Checkerboard pattern
plain_imgs[2, :, :16, :16] = 1
plain_imgs[2, :, 16:, 16:] = -1
# Color progression
xx, yy = torch.meshgrid(torch.linspace(-1, 1, 32), torch.linspace(-1, 1, 32))
plain_imgs[3, 0, :, :] = xx
plain_imgs[3, 1, :, :] = yy

visualize_reconstructions(model_dict[256]["model"], plain_imgs)

# %% [markdown]
# The plain, constant images are reconstructed relatively well, although the single color channel contains some noticeable noise.
# The hard borders of the checkerboard pattern are not as sharp as intended, and neither is the color progression,
# both because such patterns never occur in the real-world pictures of CIFAR.
#
# In general, autoencoders tend to fail at reconstructing high-frequency noise (i.e. sudden, big changes across few pixels)
# due to the choice of MSE as loss function (see our previous discussion about loss functions in autoencoders).
# Small misalignments in the decoder can lead to huge losses, so that the model settles for the expected value/mean in these regions.
# For low-frequency noise, a misalignment of a few pixels does not result in a big difference from the original image.
# However, the larger the latent dimensionality becomes, the more of this high-frequency noise can be accurately reconstructed.

# %% [markdown]
# ### Generating new images
#
# Variational autoencoders are a generative version of autoencoders because we regularize the latent space to follow a Gaussian distribution.
# However, in vanilla autoencoders, we do not have any restrictions on the latent vector.
# So what happens if we actually input a randomly sampled latent vector into the decoder?
# Let's find out below:

# %%
model = model_dict[256]["model"]
latent_vectors = torch.randn(8, model.hparams.latent_dim, device=model.device)
with torch.no_grad():
    imgs = model.decoder(latent_vectors)
    imgs = imgs.cpu()

grid = torchvision.utils.make_grid(imgs, nrow=4, normalize=True, range=(-1, 1), pad_value=0.5)
grid = grid.permute(1, 2, 0)
plt.figure(figsize=(8, 5))
plt.imshow(grid)
plt.axis("off")
plt.show()

# %% [markdown]
# As we can see, the generated images look more like art than realistic images.
# As the autoencoder was allowed to structure the latent space in whichever way best suits the reconstruction,
# there is no incentive to map every possible latent vector to realistic images.
# Furthermore, the distribution in latent space is unknown to us and doesn't necessarily follow a multivariate normal distribution.
+# Thus, we can conclude that vanilla autoencoders are indeed not generative. + +# %% [markdown] +# ## Finding visually similar images +# +# One application of autoencoders is to build an image-based search engine to retrieve visually similar images. +# This can be done by representing all images as their latent dimensionality, and find the closest $K$ images in this domain. +# The first step to such a search engine is to encode all images into $z$. +# In the following, we will use the training set as a search corpus, and the test set as queries to the system. +# +# (Warning: the following cells can be computationally heavy for a weak CPU-only system. +# If you do not have a strong computer and are not on Google Colab, +# you might want to skip the execution of the following cells and rely on the results shown in the filled notebook) + +# %% +# We use the following model throughout this section. +# If you want to try a different latent dimensionality, change it here! +model = model_dict[128]["model"] + + +# %% +def embed_imgs(model, data_loader): + # Encode all images in the data_laoder using model, and return both images and encodings + img_list, embed_list = [], [] + model.eval() + for imgs, _ in tqdm(data_loader, desc="Encoding images", leave=False): + with torch.no_grad(): + z = model.encoder(imgs.to(model.device)) + img_list.append(imgs) + embed_list.append(z) + return (torch.cat(img_list, dim=0), torch.cat(embed_list, dim=0)) + + +train_img_embeds = embed_imgs(model, train_loader) +test_img_embeds = embed_imgs(model, test_loader) + +# %% [markdown] +# After encoding all images, we just need to write a function that finds the closest $K$ images and returns (or plots) those: + + +# %% +def find_similar_images(query_img, query_z, key_embeds, K=8): + # Find closest K images. We use the euclidean distance here but other like cosine distance can also be used. + dist = torch.cdist(query_z[None, :], key_embeds[1], p=2) + dist = dist.squeeze(dim=0) + dist, indices = torch.sort(dist) + # Plot K closest images + imgs_to_display = torch.cat([query_img[None], key_embeds[0][indices[:K]]], dim=0) + grid = torchvision.utils.make_grid(imgs_to_display, nrow=K + 1, normalize=True, range=(-1, 1)) + grid = grid.permute(1, 2, 0) + plt.figure(figsize=(12, 3)) + plt.imshow(grid) + plt.axis("off") + plt.show() + + +# %% +# Plot the closest images for the first N test images as example +for i in range(8): + find_similar_images(test_img_embeds[0][i], test_img_embeds[1][i], key_embeds=train_img_embeds) + +# %% [markdown] +# Based on our autoencoder, we see that we are able to retrieve many similar images to the test input. +# In particular, in row 4, we can spot that some test images might not be that different +# from the training set as we thought (same poster, just different scaling/color scaling). +# We also see that although we haven't given the model any labels, +# it can cluster different classes in different parts of the latent space (airplane + ship, animals, etc.). +# This is why autoencoders can also be used as a pre-training strategy for deep networks, +# especially when we have a large set of unlabeled images (often the case). +# However, it should be noted that the background still plays a big role in autoencoders while it doesn't for classification. +# Hence, we don't get "perfect" clusters and need to finetune such models for classification. 
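
# %% [markdown]
# The retrieval above uses the Euclidean distance in latent space. As noted in the code comment, other
# distances such as the cosine distance can be used as well. A minimal variant of `find_similar_images`
# (sketched here for illustration, reusing the same inputs) only changes how the distances are computed:


# %%
def find_similar_images_cosine(query_img, query_z, key_embeds, K=8):
    # Cosine distance = 1 - cosine similarity between the query encoding and all key encodings
    cos_sim = F.cosine_similarity(query_z[None, :], key_embeds[1], dim=-1)
    _, indices = torch.sort(1.0 - cos_sim)
    indices = indices.cpu()  # the stored images live on the CPU
    # Plot the query image followed by its K closest matches
    imgs_to_display = torch.cat([query_img[None].cpu(), key_embeds[0][indices[:K]]], dim=0)
    grid = torchvision.utils.make_grid(imgs_to_display, nrow=K + 1, normalize=True, range=(-1, 1))
    plt.figure(figsize=(12, 3))
    plt.imshow(grid.permute(1, 2, 0))
    plt.axis("off")
    plt.show()


find_similar_images_cosine(test_img_embeds[0][0], test_img_embeds[1][0], key_embeds=train_img_embeds)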
+ +# %% [markdown] +# ### Tensorboard clustering +# +# Another way of exploring the similarity of images in the latent space is by dimensionality-reduction methods like PCA or T-SNE. +# Luckily, Tensorboard provides a nice interface for this and we can make use of it in the following: + +# %% +# We use the following model throughout this section. +# If you want to try a different latent dimensionality, change it here! +model = model_dict[128]["model"] + +# %% +# Create a summary writer +writer = SummaryWriter("tensorboard/") + +# %% [markdown] +# The function `add_embedding` allows us to add high-dimensional feature vectors to TensorBoard on which we can perform clustering. +# What we have to provide in the function are the feature vectors, additional metadata such as the labels, +# and the original images so that we can identify a specific image in the clustering. + +# %% +# In case you obtain the following error in the next cell, execute the import statements and last line in this cell +# AttributeError: module 'tensorflow._api.v2.io.gfile' has no attribute 'get_filesystem' + +# import tensorflow as tf +# import tensorboard as tb +# tf.io.gfile = tb.compat.tensorflow_stub.io.gfile + +# %% +# Note: the embedding projector in tensorboard is computationally heavy. +# Reduce the image amount below if your computer struggles with visualizing all 10k points +NUM_IMGS = len(test_set) + +writer.add_embedding( + test_img_embeds[1][:NUM_IMGS], # Encodings per image + metadata=[test_set[i][1] for i in range(NUM_IMGS)], # Adding the labels per image to the plot + label_img=(test_img_embeds[0][:NUM_IMGS] + 1) / 2.0, +) # Adding the original images to the plot + +# %% [markdown] +# Finally, we can run tensorboard to explore similarities among images: + +# %% +# Uncomment the next line to start the tensorboard +# %tensorboard --logdir tensorboard/ + +# %% [markdown] +# You should be able to see something similar as in the following image. +# In case the projector stays empty, try to start the TensorBoard outside of the Jupyter notebook. +# +#
+# +# Overall, we can see that the model indeed clustered images together that are visually similar. +# Especially the background color seems to be a crucial factor in the encoding. +# This correlates to the chosen loss function, here Mean Squared Error on pixel-level +# because the background is responsible for more than half of the pixels in an average image. +# Hence, the model learns to focus on it. +# Nevertheless, we can see that the encodings also separate a couple of classes in the latent space although it hasn't seen any labels. +# This shows again that autoencoding can also be used as a "pre-training"/transfer learning task before classification. + +# %% +# Closing the summary writer +writer.close() + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we have implemented our own autoencoder on small RGB images and explored various properties of the model. +# In contrast to variational autoencoders, vanilla AEs are not generative and can work on MSE loss functions. +# This makes them often easier to train. +# Both versions of AE can be used for dimensionality reduction, as we have seen for finding visually similar images beyond pixel distances. +# Despite autoencoders gaining less interest in the research community due to their more "theoretically" +# challenging counterpart of VAEs, autoencoders still find usage in a lot of applications like denoising and compression. +# Hence, AEs are an essential tool that every Deep Learning engineer/researcher should be familiar with. diff --git a/course_UvA-DL/08-deep-autoencoders/autoencoder_visualization.svg b/course_UvA-DL/08-deep-autoencoders/autoencoder_visualization.svg new file mode 100644 index 0000000..f6c0411 --- /dev/null +++ b/course_UvA-DL/08-deep-autoencoders/autoencoder_visualization.svg @@ -0,0 +1,3 @@ + + +
(SVG text labels: Encoder, Decoder, Input 𝑥, 𝑧, Reconstruction 𝑥̂)
diff --git a/course_UvA-DL/08-deep-autoencoders/deconvolution.gif b/course_UvA-DL/08-deep-autoencoders/deconvolution.gif new file mode 100644 index 0000000..894c1d6 Binary files /dev/null and b/course_UvA-DL/08-deep-autoencoders/deconvolution.gif differ diff --git a/course_UvA-DL/08-deep-autoencoders/tensorboard_projector_screenshot.jpeg b/course_UvA-DL/08-deep-autoencoders/tensorboard_projector_screenshot.jpeg new file mode 100644 index 0000000..c638110 Binary files /dev/null and b/course_UvA-DL/08-deep-autoencoders/tensorboard_projector_screenshot.jpeg differ diff --git a/course_UvA-DL/09-normalizing-flows/.meta.yml b/course_UvA-DL/09-normalizing-flows/.meta.yml new file mode 100644 index 0000000..d366a99 --- /dev/null +++ b/course_UvA-DL/09-normalizing-flows/.meta.yml @@ -0,0 +1,31 @@ +title: "Tutorial 9: Normalizing Flows for Image Modeling" +author: Phillip Lippe +created: 2021-06-07 +updated: 2023-03-14 +license: CC BY-SA +build: 0 +tags: + - Image +description: | + In this tutorial, we will take a closer look at complex, deep normalizing flows. + The most popular, current application of deep normalizing flows is to model datasets of images. + As for other generative models, images are a good domain to start working on because + (1) CNNs are widely studied and strong models exist, + (2) images are high-dimensional and complex, + and (3) images are discrete integers. + In this tutorial, we will review current advances in normalizing flows for image modeling, + and get hands-on experience on coding normalizing flows. + Note that normalizing flows are commonly parameter heavy and therefore computationally expensive. + We will use relatively simple and shallow flows to save computational cost and allow you to run the notebook on CPU, + but keep in mind that a simple way to improve the scores of the flows we study here is to make them deeper. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - torchvision + - matplotlib + - seaborn + - tabulate + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/course_UvA-DL/09-normalizing-flows/.thumb.jpg b/course_UvA-DL/09-normalizing-flows/.thumb.jpg new file mode 100644 index 0000000..9654f8a Binary files /dev/null and b/course_UvA-DL/09-normalizing-flows/.thumb.jpg differ diff --git a/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py b/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py new file mode 100644 index 0000000..4462290 --- /dev/null +++ b/course_UvA-DL/09-normalizing-flows/NF_image_modeling.py @@ -0,0 +1,1423 @@ +# %% [markdown] +#
+# Throughout this notebook, we make use of [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/). +# The first cell imports our usual libraries. + +# %% +import math +import os +import time +import urllib.request +from urllib.error import HTTPError + +import lightning as L +import matplotlib +import matplotlib.pyplot as plt +import matplotlib_inline.backend_inline +import numpy as np +import seaborn as sns +import tabulate +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data +import torchvision +from IPython.display import HTML, display +from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint +from matplotlib.colors import to_rgb +from torch import Tensor +from torchvision import transforms +from torchvision.datasets import MNIST +from tqdm.notebook import tqdm + +# %matplotlib inline +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +matplotlib.rcParams["lines.linewidth"] = 2.0 +sns.reset_orig() + +# Path to the folder where the datasets are/should be downloaded (e.g. MNIST) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/tutorial11") + +# Setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +# Fetching the device that will be used throughout this notebook +device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0") +print("Using device", device) + +# %% [markdown] +# Again, we have a few pretrained models. We download them below to the specified path above. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial11/" +# Files to download +pretrained_files = ["MNISTFlow_simple.ckpt", "MNISTFlow_vardeq.ckpt", "MNISTFlow_multiscale.ckpt"] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print("Downloading %s..." % file_url) + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# We will use the MNIST dataset in this notebook. +# MNIST constitutes, despite its simplicity, a challenge for small generative models as it requires the global understanding of an image. +# At the same time, we can easily judge whether generated images come from the same distribution as the dataset +# (i.e. represent real digits), or not. +# +# To deal better with the discrete nature of the images, we transform them +# from a range of 0-1 to a range of 0-255 as integers. 
+ + +# %% +# Convert images from 0-1 to 0-255 (integers) +def discretize(sample): + return (sample * 255).to(torch.int32) + + +# Transformations applied on each image => make them a tensor and discretize +transform = transforms.Compose([transforms.ToTensor(), discretize]) + +# Loading the training dataset. We need to split it into a training and validation part +train_dataset = MNIST(root=DATASET_PATH, train=True, transform=transform, download=True) +L.seed_everything(42) +train_set, val_set = torch.utils.data.random_split(train_dataset, [50000, 10000]) + +# Loading the test set +test_set = MNIST(root=DATASET_PATH, train=False, transform=transform, download=True) + +# We define a set of data loaders that we can use for various purposes later. +# Note that for actually training a model, we will use different data loaders +# with a lower batch size. +train_loader = data.DataLoader(train_set, batch_size=256, shuffle=False, drop_last=False) +val_loader = data.DataLoader(val_set, batch_size=64, shuffle=False, drop_last=False, num_workers=4) +test_loader = data.DataLoader(test_set, batch_size=64, shuffle=False, drop_last=False, num_workers=4) + +# %% [markdown] +# In addition, we will define below a function to simplify the visualization of images/samples. +# Some training examples of the MNIST dataset is shown below. + + +# %% +def show_imgs(imgs, title=None, row_size=4): + # Form a grid of pictures (we use max. 8 columns) + num_imgs = imgs.shape[0] if isinstance(imgs, Tensor) else len(imgs) + is_int = imgs.dtype == torch.int32 if isinstance(imgs, Tensor) else imgs[0].dtype == torch.int32 + nrow = min(num_imgs, row_size) + ncol = int(math.ceil(num_imgs / nrow)) + imgs = torchvision.utils.make_grid(imgs, nrow=nrow, pad_value=128 if is_int else 0.5) + np_imgs = imgs.cpu().numpy() + # Plot the grid + plt.figure(figsize=(1.5 * nrow, 1.5 * ncol)) + plt.imshow(np.transpose(np_imgs, (1, 2, 0)), interpolation="nearest") + plt.axis("off") + if title is not None: + plt.title(title) + plt.show() + plt.close() + + +show_imgs([train_set[i][0] for i in range(8)]) + +# %% [markdown] +# ## Normalizing Flows as generative model +# +# In the previous lectures, we have seen Energy-based models, Variational Autoencoders (VAEs) +# and Generative Adversarial Networks (GANs) as example of generative models. +# However, none of them explicitly learn the probability density function $p(x)$ of the real input data. +# While VAEs model a lower bound, energy-based models only implicitly learn the probability density. +# GANs on the other hand provide us a sampling mechanism for generating new data, without offering a likelihood estimate. +# The generative model we will look at here, called Normalizing Flows, actually models the true data distribution +# $p(x)$ and provides us with an exact likelihood estimate. +# Below, we can visually compare VAEs, GANs and Flows +# (figure credit - [Lilian Weng](https://lilianweng.github.io/lil-log/2018/10/13/flow-based-deep-generative-models.html)): +# +#
+# +# The major difference compared to VAEs is that flows use *invertible* functions $f$ +# to map the input data $x$ to a latent representation $z$. +# To realize this, $z$ must be of the same shape as $x$. +# This is in contrast to VAEs where $z$ is usually much lower dimensional than the original input data. +# However, an invertible mapping also means that for every data point $x$, we have a corresponding latent representation +# $z$ which allows us to perform lossless reconstruction ($z$ to $x$). +# In the visualization above, this means that $x=x'$ for flows, no matter what invertible function $f$ and input $x$ we choose. +# +# Nonetheless, how are normalizing flows modeling a probability density with an invertible function? +# The answer to this question is the rule for change of variables. +# Specifically, given a prior density $p_z(z)$ (e.g. Gaussian) and an invertible function $f$, +# we can determine $p_x(x)$ as follows: +# +# $$ +# \begin{split} +# \int p_x(x) dx & = \int p_z(z) dz = 1 \hspace{1cm}\text{(by definition of a probability distribution)}\\ +# \Leftrightarrow p_x(x) & = p_z(z) \left|\frac{dz}{dx}\right| = p_z(f(x)) \left|\frac{df(x)}{dx}\right| +# \end{split} +# $$ +# +# Hence, in order to determine the probability of $x$, we only need to determine its probability in latent space, +# and get the derivate of $f$. +# Note that this is for a univariate distribution, and $f$ is required to be invertible and smooth. +# For a multivariate case, the derivative becomes a Jacobian of which we need to take the determinant. +# As we usually use the log-likelihood as objective, we write the multivariate term with logarithms below: +# +# $$ +# \log p_x(\mathbf{x}) = \log p_z(f(\mathbf{x})) + \log{} \left|\det \frac{df(\mathbf{x})}{d\mathbf{x}}\right| +# $$ +# +# Although we now know how a normalizing flow obtains its likelihood, it might not be clear what a normalizing flow does intuitively. +# For this, we should look from the inverse perspective of the flow starting with the prior probability density $p_z(z)$. +# If we apply an invertible function on it, we effectively "transform" its probability density. +# For instance, if $f^{-1}(z)=z+1$, we shift the density by one while still remaining a valid probability distribution, +# and being invertible. +# We can also apply more complex transformations, like scaling: $f^{-1}(z)=2z+1$, but there you might see a difference. +# When you scale, you also change the volume of the probability density, as for example on uniform distributions +# (figure credit - [Eric Jang](https://blog.evjang.com/2018/01/nf1.html)): +# +#
+#
# You can see that the height of $p(y)$ should be lower than $p(x)$ after scaling.
# This change in volume represents $\left|\frac{df(x)}{dx}\right|$ in our equation above,
# and ensures that even after scaling, we still have a valid probability distribution.
# We can go on making our function $f$ more complex.
# However, the more complex $f$ becomes, the harder it will be to find its inverse $f^{-1}$,
# and to calculate the log-determinant of the Jacobian $\log{} \left|\det \frac{df(\mathbf{x})}{d\mathbf{x}}\right|$.
# An easier trick is to stack multiple invertible functions $f_{1,...,K}$ after each other, as together
# they still represent a single, invertible function.
# Using multiple, learnable invertible functions, a normalizing flow attempts to transform
# $p_z(z)$ slowly into a more complex distribution which should finally be $p_x(x)$.
# We visualize the idea below
# (figure credit - [Lilian Weng](https://lilianweng.github.io/lil-log/2018/10/13/flow-based-deep-generative-models.html)):
#
#
+# +# Starting from $z_0$, which follows the prior Gaussian distribution, we sequentially apply the invertible +# functions $f_1,f_2,...,f_K$, until $z_K$ represents $x$. +# Note that in the figure above, the functions $f$ represent the inverted function from $f$ we had above +# (here: $f:Z\to X$, above: $f:X\to Z$). +# This is just a different notation and has no impact on the actual flow design because all $f$ need to be invertible anyways. +# When we estimate the log likelihood of a data point $x$ as in the equations above, +# we run the flows in the opposite direction than visualized above. +# Multiple flow layers have been proposed that use a neural network as learnable parameters, +# such as the planar and radial flow. +# However, we will focus here on flows that are commonly used in image +# modeling, and will discuss them in the rest of the notebook along with +# the details of how to train a normalizing flow. + +# %% [markdown] +# ## Normalizing Flows on images +# +#
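
# %% [markdown]
# As a small, self-contained illustration of the change-of-variables formula above (not needed for the
# rest of the notebook), consider the invertible map $f(x)=(x-\mu)/\sigma$ together with a standard
# Gaussian prior $p_z$. The formula then recovers exactly the density of a Gaussian with mean $\mu$ and
# standard deviation $\sigma$:

# %%
prior_1d = torch.distributions.Normal(loc=0.0, scale=1.0)
mu, sigma = 1.5, 2.0
x_1d = torch.linspace(-5.0, 8.0, steps=7)
z_1d = (x_1d - mu) / sigma  # f(x)
# log p_x(x) = log p_z(f(x)) + log |df/dx|, with df/dx = 1/sigma for this affine map
log_px = prior_1d.log_prob(z_1d) - math.log(sigma)
print(torch.allclose(log_px, torch.distributions.Normal(loc=mu, scale=sigma).log_prob(x_1d)))  # True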
+# +# To become familiar with normalizing flows, especially for the application of image modeling, +# it is best to discuss the different elements in a flow along with the implementation. +# As a general concept, we want to build a normalizing flow that maps an input image (here MNIST) to an equally sized latent space: +# +#
+#
+# As a first step, we will implement a template of a normalizing flow in PyTorch Lightning.
+# During training and validation, a normalizing flow performs density estimation in the forward direction.
+# For this, we apply a series of flow transformations on the input $x$ and estimate the probability
+# of the input by determining the probability of the transformed point $z$ given a prior,
+# and the change of volume caused by the transformations.
+# During inference, we can perform both density estimation and sampling of new points by inverting the flow transformations.
+# Therefore, we define a function `_get_likelihood` which performs density estimation,
+# and `sample` to generate new examples.
+# The functions `training_step`, `validation_step` and `test_step` all make use of `_get_likelihood`.
+#
+# The standard metric used in generative models, and in particular normalizing flows, is bits per dimension (bpd).
+# Bpd is motivated from an information theory perspective and describes how many bits we would need to encode a particular example in our modeled distribution.
+# The fewer bits we need, the more likely the example is under our modeled distribution.
+# When we measure the bits per dimension on our test dataset, we can judge whether our model generalizes to new samples of the dataset and didn't memorize the training dataset.
+# In order to calculate the bits per dimension score, we can rely on the negative log-likelihood and change the log base (the NLL is measured in nats, i.e. with the natural logarithm, while bits use base 2):
+#
+# $$\text{bpd} = \text{nll} \cdot \log_2\left(\exp(1)\right) \cdot \left(\prod d_i\right)^{-1}$$
+#
+# where $d_1,...,d_K$ are the dimensions of the input.
+# For images, this would be the height, width and channel number.
+# We divide the log likelihood by these extra dimensions to have a metric which we can compare for different image resolutions.
+# In the original image space, MNIST examples have a bits per dimension
+# score of 8 (we need 8 bits to encode each pixel as there are 256
+# possible values).
+
+
+# %%
+class ImageFlow(L.LightningModule):
+    def __init__(self, flows, import_samples=8):
+        """ImageFlow.
+
+        Args:
+            flows: A list of flows (each a nn.Module) that should be applied on the images.
+            import_samples: Number of importance samples to use during testing (see explanation below). Can be changed at any time.
+        """
+        super().__init__()
+        self.flows = nn.ModuleList(flows)
+        self.import_samples = import_samples
+        # Create prior distribution for final latent space
+        self.prior = torch.distributions.normal.Normal(loc=0.0, scale=1.0)
+        # Example input for visualizing the graph
+        self.example_input_array = train_set[0][0].unsqueeze(dim=0)
+
+    def forward(self, imgs):
+        # The forward function is only used for visualizing the graph
+        return self._get_likelihood(imgs)
+
+    def encode(self, imgs):
+        # Given a batch of images, return the latent representation z and ldj of the transformations
+        z, ldj = imgs, torch.zeros(imgs.shape[0], device=self.device)
+        for flow in self.flows:
+            z, ldj = flow(z, ldj, reverse=False)
+        return z, ldj
+
+    def _get_likelihood(self, imgs, return_ll=False):
+        """Given a batch of images, return their likelihood.
+
+        If return_ll is True, this function returns the log likelihood of the input.
Otherwise, the output metric is
+        bits per dimension (scaled negative log likelihood)
+        """
+        z, ldj = self.encode(imgs)
+        log_pz = self.prior.log_prob(z).sum(dim=[1, 2, 3])
+        log_px = ldj + log_pz
+        nll = -log_px
+        # Calculating bits per dimension
+        bpd = nll * np.log2(np.exp(1)) / np.prod(imgs.shape[1:])
+        return bpd.mean() if not return_ll else log_px
+
+    @torch.no_grad()
+    def sample(self, img_shape, z_init=None):
+        """Sample a batch of images from the flow."""
+        # Sample latent representation from prior
+        if z_init is None:
+            z = self.prior.sample(sample_shape=img_shape).to(device)
+        else:
+            z = z_init.to(device)
+
+        # Transform z to x by inverting the flows
+        ldj = torch.zeros(img_shape[0], device=device)
+        for flow in reversed(self.flows):
+            z, ldj = flow(z, ldj, reverse=True)
+        return z
+
+    def configure_optimizers(self):
+        optimizer = optim.Adam(self.parameters(), lr=1e-3)
+        # A scheduler is optional, but can help in flows to get the last bpd improvement
+        scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.99)
+        return [optimizer], [scheduler]
+
+    def training_step(self, batch, batch_idx):
+        # Normalizing flows are trained by maximum likelihood => return bpd
+        loss = self._get_likelihood(batch[0])
+        self.log("train_bpd", loss)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        loss = self._get_likelihood(batch[0])
+        self.log("val_bpd", loss)
+
+    def test_step(self, batch, batch_idx):
+        # Perform importance sampling during testing => estimate likelihood M times for each image
+        samples = []
+        for _ in range(self.import_samples):
+            img_ll = self._get_likelihood(batch[0], return_ll=True)
+            samples.append(img_ll)
+        img_ll = torch.stack(samples, dim=-1)
+
+        # To average the probabilities, we need to go from log-space to exp, and back to log.
+        # Logsumexp provides us with a stable implementation for this
+        img_ll = torch.logsumexp(img_ll, dim=-1) - np.log(self.import_samples)
+
+        # Calculate final bpd
+        bpd = -img_ll * np.log2(np.exp(1)) / np.prod(batch[0].shape[1:])
+        bpd = bpd.mean()
+
+        self.log("test_bpd", bpd)
+
+
+# %% [markdown]
+# The `test_step` function differs from the training and validation step in that it makes use of importance sampling.
+# We will discuss the motivation and details behind this after
+# understanding how flows model discrete images in continuous space.
+
+# %% [markdown]
+# ### Dequantization
+#
+# Normalizing flows rely on the rule of change of variables, which is naturally defined in continuous space.
+# Applying flows directly on discrete data leads to undesired density models where arbitrarily high likelihoods are placed on a few, particular values.
+# See the illustration below:
+#
+#
+#
+# The black points represent the discrete points, and the green volume the density modeled by a normalizing flow in continuous space.
+# The flow would continue to increase the likelihood for $x=0,1,2,3$ while having no volume on any other point.
+# Remember that in continuous space, we have the constraint that the overall volume of the probability density must be 1 ($\int p(x)dx=1$).
+# Otherwise, we don't model a probability distribution anymore.
+# However, the discrete points $x=0,1,2,3$ represent delta peaks with no width in continuous space.
+# This is why the flow can place an infinitely high likelihood on these few points while still representing a distribution in continuous space.
+# Nonetheless, the learned density does not tell us anything about the distribution among the discrete points,
+# as in discrete space, the likelihoods of those four points would have to sum to 1, not to infinity.
+#
+# To prevent such degenerate solutions, a common approach is to add a small amount of noise to each discrete value, which is also referred to as dequantization.
+# Considering $x$ as an integer (as is the case for images), the dequantized representation $v$ can be formulated as $v=x+u$ where $u\in[0,1)^D$.
+# Thus, the discrete value $1$ is modeled by a distribution over the interval $[1.0, 2.0)$, the value $2$ by a volume over $[2.0, 3.0)$, etc.
+# Our objective of modeling $p(x)$ becomes:
+#
+# $$ p(x) = \int p(x+u)du = \int \frac{q(u|x)}{q(u|x)}p(x+u)du = \mathbb{E}_{u\sim q(u|x)}\left[\frac{p(x+u)}{q(u|x)} \right]$$
+#
+# with $q(u|x)$ being the noise distribution.
+# For now, we assume it to be uniform, which can also be written as $p(x)=\mathbb{E}_{u\sim U(0,1)^D}\left[p(x+u) \right]$.
+#
+# In the following, we will implement Dequantization as a flow transformation itself.
+# After adding noise to the discrete values, we additionally transform the volume into a Gaussian-like shape.
+# This is done by scaling $x+u$ between $0$ and $1$, and applying the inverse of the sigmoid function, $\sigma^{-1}(z) = \log z - \log (1-z)$.
+# If we did not do this, we would face two problems:
+#
+# 1. The input is scaled between 0 and 256 while the prior distribution is a Gaussian with mean $0$ and standard deviation $1$.
+#    In the first iterations after initializing the parameters of the flow, we would have extremely low likelihoods for large values like $256$.
+#    This would cause the training to diverge instantaneously.
+# 2. As the output distribution is a Gaussian, it is beneficial for the flow to have a similarly shaped input distribution.
+#    This will reduce the modeling complexity that is required by the flow.
+#
+# Overall, we can implement dequantization as follows:
+
+
+# %%
+class Dequantization(nn.Module):
+    def __init__(self, alpha=1e-5, quants=256):
+        """Dequantization.
+
+        Args:
+            alpha: small constant that is used to scale the original input.
+ Prevents dealing with values very close to 0 and 1 when inverting the sigmoid + quants: Number of possible discrete values (usually 256 for 8-bit image) + """ + super().__init__() + self.alpha = alpha + self.quants = quants + + def forward(self, z, ldj, reverse=False): + if not reverse: + z, ldj = self.dequant(z, ldj) + z, ldj = self.sigmoid(z, ldj, reverse=True) + else: + z, ldj = self.sigmoid(z, ldj, reverse=False) + z = z * self.quants + ldj += np.log(self.quants) * np.prod(z.shape[1:]) + z = torch.floor(z).clamp(min=0, max=self.quants - 1).to(torch.int32) + return z, ldj + + def sigmoid(self, z, ldj, reverse=False): + # Applies an invertible sigmoid transformation + if not reverse: + ldj += (-z - 2 * F.softplus(-z)).sum(dim=[1, 2, 3]) + z = torch.sigmoid(z) + else: + z = z * (1 - self.alpha) + 0.5 * self.alpha # Scale to prevent boundaries 0 and 1 + ldj += np.log(1 - self.alpha) * np.prod(z.shape[1:]) + ldj += (-torch.log(z) - torch.log(1 - z)).sum(dim=[1, 2, 3]) + z = torch.log(z) - torch.log(1 - z) + return z, ldj + + def dequant(self, z, ldj): + # Transform discrete values to continuous volumes + z = z.to(torch.float32) + z = z + torch.rand_like(z).detach() + z = z / self.quants + ldj -= np.log(self.quants) * np.prod(z.shape[1:]) + return z, ldj + + +# %% [markdown] +# A good check whether a flow is correctly implemented or not, is to verify that it is invertible. +# Hence, we will dequantize a randomly chosen training image, and then quantize it again. +# We would expect that we would get the exact same image out: + +# %% +# Testing invertibility of dequantization layer +L.seed_everything(42) +orig_img = train_set[0][0].unsqueeze(dim=0) +ldj = torch.zeros( + 1, +) +dequant_module = Dequantization() +deq_img, ldj = dequant_module(orig_img, ldj, reverse=False) +reconst_img, ldj = dequant_module(deq_img, ldj, reverse=True) + +d1, d2 = torch.where(orig_img.squeeze() != reconst_img.squeeze()) +if len(d1) != 0: + print("Dequantization was not invertible.") + for i in range(d1.shape[0]): + print("Original value:", orig_img[0, 0, d1[i], d2[i]].item()) + print("Reconstructed value:", reconst_img[0, 0, d1[i], d2[i]].item()) +else: + print("Successfully inverted dequantization") + +# Layer is not strictly invertible due to float precision constraints +# assert (orig_img == reconst_img).all().item() + +# %% [markdown] +# In contrast to our expectation, the test fails. +# However, this is no reason to doubt our implementation here as only one single value is not equal to the original. +# This is caused due to numerical inaccuracies in the sigmoid invert. +# While the input space to the inverted sigmoid is scaled between 0 and 1, the output space is between $-\infty$ and $\infty$. +# And as we use 32 bits to represent the numbers (in addition to applying logs over and over again), +# such inaccuries can occur and should not be worrisome. +# Nevertheless, it is good to be aware of them, and can be improved by using a double tensor (float64). +# +# Finally, we can take our dequantization and actually visualize the +# distribution it transforms the discrete values into: + +# %% + + +def visualize_dequantization(quants, prior=None): + """Function for visualizing the dequantization values of discrete values in continuous space.""" + # Prior over discrete values. 
If not given, a uniform is assumed
+    if prior is None:
+        prior = np.ones(quants, dtype=np.float32) / quants
+    prior = prior / prior.sum()  # Ensure proper categorical distribution
+
+    inp = torch.arange(-4, 4, 0.01).view(-1, 1, 1, 1)  # Possible continuous values we want to consider
+    ldj = torch.zeros(inp.shape[0])
+    dequant_module = Dequantization(quants=quants)
+    # Invert dequantization on continuous values to find corresponding discrete value
+    out, ldj = dequant_module.forward(inp, ldj, reverse=True)
+    inp, out, prob = inp.squeeze().numpy(), out.squeeze().numpy(), ldj.exp().numpy()
+    prob = prob * prior[out]  # Probability scaled by categorical prior
+
+    # Plot volumes and continuous distribution
+    sns.set_style("white")
+    _ = plt.figure(figsize=(6, 3))
+    x_ticks = []
+    for v in np.unique(out):
+        indices = np.where(out == v)
+        color = to_rgb("C%i" % v)
+        plt.fill_between(inp[indices], prob[indices], np.zeros(indices[0].shape[0]), color=color + (0.5,), label=str(v))
+        plt.plot([inp[indices[0][0]]] * 2, [0, prob[indices[0][0]]], color=color)
+        plt.plot([inp[indices[0][-1]]] * 2, [0, prob[indices[0][-1]]], color=color)
+        x_ticks.append(inp[indices[0][0]])
+    x_ticks.append(inp.max())
+    plt.xticks(x_ticks, ["%.1f" % x for x in x_ticks])
+    plt.plot(inp, prob, color=(0.0, 0.0, 0.0))
+    # Set final plot properties
+    plt.ylim(0, prob.max() * 1.1)
+    plt.xlim(inp.min(), inp.max())
+    plt.xlabel("z")
+    plt.ylabel("Probability")
+    plt.title("Dequantization distribution for %i discrete values" % quants)
+    plt.legend()
+    plt.show()
+    plt.close()
+
+
+visualize_dequantization(quants=8)
+
+# %% [markdown]
+# The visualized distribution shows the sub-volumes that are assigned to the different discrete values.
+# The value $0$ has its volume between $[-\infty, -1.9)$, the value $1$ is represented by the interval $[-1.9, -1.1)$, etc.
+# The volume for each discrete value has the same probability mass.
+# That's why the volumes close to the center (e.g. 3 and 4) have a smaller area on the z-axis than others
+# ($z$ is being used to denote the output of the whole dequantization flow).
+#
+# Effectively, the subsequent normalizing flow models discrete images by the following objective:
+#
+# $$\log p(x) = \log \mathbb{E}_{u\sim q(u|x)}\left[\frac{p(x+u)}{q(u|x)} \right] \geq \mathbb{E}_{u}\left[\log \frac{p(x+u)}{q(u|x)} \right]$$
+#
+# Although normalizing flows compute exact likelihoods in continuous space, the dequantization objective only gives us a lower bound on the discrete likelihood.
+# Specifically, this is an application of Jensen's inequality: we move the logarithm into the expectation so that we can use Monte Carlo estimates.
+# In general, the gap introduced by this bound is considerably smaller than that of the ELBO in variational autoencoders.
+# Actually, we can tighten the bound ourselves by estimating the expectation with not just one, but $M$ samples.
+# In other words, we can apply importance sampling, which leads to the following inequality:
+#
+# $$\log p(x) = \log \mathbb{E}_{u\sim q(u|x)}\left[\frac{p(x+u)}{q(u|x)} \right] \geq \mathbb{E}_{u}\left[\log \frac{1}{M} \sum_{m=1}^{M} \frac{p(x+u_m)}{q(u_m|x)} \right] \geq \mathbb{E}_{u}\left[\log \frac{p(x+u)}{q(u|x)} \right]$$
+#
+# The importance sampling $\frac{1}{M} \sum_{m=1}^{M} \frac{p(x+u_m)}{q(u_m|x)}$ becomes
+# $\mathbb{E}_{u\sim q(u|x)}\left[\frac{p(x+u)}{q(u|x)} \right]$ if $M\to \infty$,
+# so that the more samples we use, the tighter the bound becomes.
+# During testing, we make use of this property, as implemented in `test_step` of `ImageFlow`.
+# In theory, we could also use this tighter bound during training;
+# the small toy example below illustrates how the importance-weighted estimate tightens with $M$.
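+
+# %% [markdown]
+# In the sketch below (a toy illustration that is independent of our flow model), $p$ and $q$ are two hypothetical Gaussians,
+# so that the true value $\log \mathbb{E}_{u\sim q}\left[p(u)/q(u)\right] = \log 1 = 0$ is known.
+# The single-sample bound equals $-\text{KL}(q\|p) < 0$, and the estimate approaches $0$ as $M$ grows:
+
+# %%
+p_toy = torch.distributions.normal.Normal(loc=0.0, scale=1.0)
+q_toy = torch.distributions.normal.Normal(loc=0.5, scale=1.0)
+num_estimates = 10000
+for M in [1, 4, 16, 64]:
+    u = q_toy.sample(sample_shape=(num_estimates, M))
+    log_w = p_toy.log_prob(u) - q_toy.log_prob(u)  # log importance weights
+    # Average the weights in log-space with logsumexp, as in `test_step`
+    bound = (torch.logsumexp(log_w, dim=-1) - float(np.log(M))).mean()
+    print("M=%3i: average bound %.4f (true value 0.0)" % (M, bound.item()))
+
+# %% [markdown]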
+# However, related work has shown that this does not necessarily lead to +# an improvement given the additional computational cost, and it is more +# efficient to stick with a single estimate [5]. + +# %% [markdown] +# ### Variational Dequantization +# +# Dequantization uses a uniform distribution for the noise $u$ which effectively leads to images being represented as hypercubes +# (cube in high dimensions) with sharp borders. +# However, modeling such sharp borders is not easy for a flow as it uses smooth transformations to convert it into a Gaussian distribution. +# +# Another way of looking at it is if we change the prior distribution in the previous visualization. +# Imagine we have independent Gaussian noise on pixels which is commonly the case for any real-world taken picture. +# Therefore, the flow would have to model a distribution as above, but with the individual volumes scaled as follows: + +# %% +visualize_dequantization(quants=8, prior=np.array([0.075, 0.2, 0.4, 0.2, 0.075, 0.025, 0.0125, 0.0125])) + +# %% [markdown] +# Transforming such a probability into a Gaussian is a difficult task, especially with such hard borders. +# Dequantization has therefore been extended to more sophisticated, learnable distributions beyond uniform in a variational framework. +# In particular, if we remember the learning objective +# $\log p(x) = \log \mathbb{E}_{u}\left[\frac{p(x+u)}{q(u|x)} \right]$, +# the uniform distribution can be replaced by a learned distribution $q_{\theta}(u|x)$ with support over $u\in[0,1)^D$. +# This approach is called Variational Dequantization and has been proposed by Ho et al. +# [3]. +# How can we learn such a distribution? +# We can use a second normalizing flow that takes $x$ as external input and learns a flexible distribution over $u$. +# To ensure a support over $[0,1)^D$, we can apply a sigmoid activation function as final flow transformation. +# +# Inheriting the original dequantization class, we can implement variational dequantization as follows: + + +# %% +class VariationalDequantization(Dequantization): + def __init__(self, var_flows, alpha=1e-5): + """Variational Dequantization. + + Args: + var_flows: A list of flow transformations to use for modeling q(u|x) + alpha: Small constant, see Dequantization for details + """ + super().__init__(alpha=alpha) + self.flows = nn.ModuleList(var_flows) + + def dequant(self, z, ldj): + z = z.to(torch.float32) + img = (z / 255.0) * 2 - 1 # We condition the flows on x, i.e. the original image + + # Prior of u is a uniform distribution as before + # As most flow transformations are defined on [-infinity,+infinity], we apply an inverse sigmoid first. + deq_noise = torch.rand_like(z).detach() + deq_noise, ldj = self.sigmoid(deq_noise, ldj, reverse=True) + for flow in self.flows: + deq_noise, ldj = flow(deq_noise, ldj, reverse=False, orig_img=img) + deq_noise, ldj = self.sigmoid(deq_noise, ldj, reverse=False) + + # After the flows, apply u as in standard dequantization + z = (z + deq_noise) / 256.0 + ldj -= np.log(256.0) * np.prod(z.shape[1:]) + return z, ldj + + +# %% [markdown] +# Variational dequantization can be used as a substitute for dequantization. +# We will compare dequantization and variational dequantization in later experiments. + +# %% [markdown] +# ### Coupling layers +# +#
+#
+# Next, we look at possible transformations to apply inside the flow.
+# A recent popular flow layer, which works well in combination with deep neural networks,
+# is the coupling layer introduced by Dinh et al.
+# [1].
+# The input $z$ is arbitrarily split into two parts, $z_{1:j}$ and $z_{j+1:d}$, of which the first remains unchanged by the flow.
+# Yet, $z_{1:j}$ is used to parameterize the transformation for the second part, $z_{j+1:d}$.
+# Various transformations have been proposed in recent years [3,4], but here we will settle for the simplest and most efficient one: affine coupling.
+# In this coupling layer, we apply an affine transformation by shifting the input by a bias $\mu$ and scaling it by $\sigma$.
+# In other words, our transformation looks as follows:
+#
+# $$z'_{j+1:d} = \mu_{\theta}(z_{1:j}) + \sigma_{\theta}(z_{1:j}) \odot z_{j+1:d}$$
+#
+# The functions $\mu$ and $\sigma$ are implemented as a shared neural network,
+# and the sum and multiplication are performed element-wise.
+# The log-determinant of the Jacobian (LDJ) is thereby the sum of the logs of the scaling factors: $\sum_i \left[\log \sigma_{\theta}(z_{1:j})\right]_i$.
+# Inverting the layer is as simple as subtracting the bias and dividing by the scale:
+#
+# $$z_{j+1:d} = \left(z'_{j+1:d} - \mu_{\theta}(z_{1:j})\right) / \sigma_{\theta}(z_{1:j})$$
+#
+# We can also visualize the coupling layer in the form of a computation graph,
+# where $z_1$ represents $z_{1:j}$, and $z_2$ represents $z_{j+1:d}$:
+#
+#
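+
+# %% [markdown]
+# Before turning to the implementation, a quick numerical sketch (illustration only, with randomly drawn stand-ins
+# for $\mu_{\theta}$ and $\log \sigma_{\theta}$) confirms that the affine coupling transformation is trivially invertible
+# and that its LDJ is simply the sum of the log scales:
+
+# %%
+z2 = torch.randn(4, 6)  # the part that gets transformed
+mu, log_sigma = torch.randn(4, 6), torch.randn(4, 6)  # would normally be predicted from z_{1:j}
+
+z2_fwd = mu + log_sigma.exp() * z2        # forward: shift and scale
+z2_rec = (z2_fwd - mu) / log_sigma.exp()  # inverse: subtract bias, divide by scale
+ldj_toy = log_sigma.sum(dim=1)            # log-determinant of the Jacobian per sample
+print("Inverse recovers input:", torch.allclose(z2_rec, z2, atol=1e-5))
+print("LDJ shape:", ldj_toy.shape)
+
+# %% [markdown]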
+# +# In our implementation, we will realize the splitting of variables as masking. +# The variables to be transformed, $z_{j+1:d}$, are masked when passing $z$ to the shared network to predict the transformation parameters. +# When applying the transformation, we mask the parameters for $z_{1:j}$ +# so that we have an identity operation for those variables: + + +# %% +class CouplingLayer(nn.Module): + def __init__(self, network, mask, c_in): + """Coupling layer inside a normalizing flow. + + Args: + network: A PyTorch nn.Module constituting the deep neural network for mu and sigma. + Output shape should be twice the channel size as the input. + mask: Binary mask (0 or 1) where 0 denotes that the element should be transformed, + while 1 means the latent will be used as input to the NN. + c_in: Number of input channels + """ + super().__init__() + self.network = network + self.scaling_factor = nn.Parameter(torch.zeros(c_in)) + # Register mask as buffer as it is a tensor which is not a parameter, + # but should be part of the modules state. + self.register_buffer("mask", mask) + + def forward(self, z, ldj, reverse=False, orig_img=None): + """Forward. + + Args: + z: Latent input to the flow + ldj: + The current ldj of the previous flows. The ldj of this layer will be added to this tensor. + reverse: If True, we apply the inverse of the layer. + orig_img: + Only needed in VarDeq. Allows external input to condition the flow on (e.g. original image) + """ + # Apply network to masked input + z_in = z * self.mask + if orig_img is None: + nn_out = self.network(z_in) + else: + nn_out = self.network(torch.cat([z_in, orig_img], dim=1)) + s, t = nn_out.chunk(2, dim=1) + + # Stabilize scaling output + s_fac = self.scaling_factor.exp().view(1, -1, 1, 1) + s = torch.tanh(s / s_fac) * s_fac + + # Mask outputs (only transform the second part) + s = s * (1 - self.mask) + t = t * (1 - self.mask) + + # Affine transformation + if not reverse: + # Whether we first shift and then scale, or the other way round, + # is a design choice, and usually does not have a big impact + z = (z + t) * torch.exp(s) + ldj += s.sum(dim=[1, 2, 3]) + else: + z = (z * torch.exp(-s)) - t + ldj -= s.sum(dim=[1, 2, 3]) + + return z, ldj + + +# %% [markdown] +# For stabilization purposes, we apply a $\tanh$ activation function on the scaling output. +# This prevents sudden large output values for the scaling that can destabilize training. +# To still allow scaling factors smaller or larger than -1 and 1 respectively, +# we have a learnable parameter per dimension, called `scaling_factor`. +# This scales the tanh to different limits. +# Below, we visualize the effect of the scaling factor on the output activation of the scaling terms: + +# %% +with torch.no_grad(): + x = torch.arange(-5, 5, 0.01) + scaling_factors = [0.5, 1, 2] + sns.set() + fig, ax = plt.subplots(1, 3, figsize=(12, 3)) + for i, scale in enumerate(scaling_factors): + y = torch.tanh(x / scale) * scale + ax[i].plot(x.numpy(), y.numpy()) + ax[i].set_title("Scaling factor: " + str(scale)) + ax[i].set_ylim(-3, 3) + plt.subplots_adjust(wspace=0.4) + sns.reset_orig() + plt.show() + +# %% [markdown] +# Coupling layers generalize to any masking technique we could think of. +# However, the most common approach for images is to split the input $z$ in half, using a checkerboard mask or channel mask. +# A checkerboard mask splits the variables across the height and width dimensions and assigns each other pixel to $z_{j+1:d}$. +# Thereby, the mask is shared across channels. 
+# In contrast, the channel mask assigns half of the channels to $z_{j+1:d}$, and the other half to $z_{1:j}$.
+# Note that when we apply multiple coupling layers, we invert the masking for every other layer so that each variable is transformed a similar number of times.
+#
+# Let's implement a function that creates a checkerboard mask and a channel mask for us:
+
+
+# %%
+def create_checkerboard_mask(h, w, invert=False):
+    x, y = torch.arange(h, dtype=torch.int32), torch.arange(w, dtype=torch.int32)
+    xx, yy = torch.meshgrid(x, y)
+    mask = torch.fmod(xx + yy, 2)
+    mask = mask.to(torch.float32).view(1, 1, h, w)
+    if invert:
+        mask = 1 - mask
+    return mask
+
+
+def create_channel_mask(c_in, invert=False):
+    mask = torch.cat([torch.ones(c_in // 2, dtype=torch.float32), torch.zeros(c_in - c_in // 2, dtype=torch.float32)])
+    mask = mask.view(1, c_in, 1, 1)
+    if invert:
+        mask = 1 - mask
+    return mask
+
+
+# %% [markdown]
+# We can also visualize the corresponding masks for an image of size $8\times 8\times 2$ (2 channels):
+
+# %%
+checkerboard_mask = create_checkerboard_mask(h=8, w=8).expand(-1, 2, -1, -1)
+channel_mask = create_channel_mask(c_in=2).expand(-1, -1, 8, 8)
+
+show_imgs(checkerboard_mask.transpose(0, 1), "Checkerboard mask")
+show_imgs(channel_mask.transpose(0, 1), "Channel mask")
+
+# %% [markdown]
+# As a last aspect of coupling layers, we need to decide on the deep neural network we want to apply inside the coupling layers.
+# The input to the layers is an image, and hence we stick with a CNN.
+# Because the input to a transformation depends on all transformations before,
+# it is crucial to ensure a good gradient flow through the CNN back to the input,
+# which can be optimally achieved by a ResNet-like architecture.
+# Specifically, we use a Gated ResNet that adds a $\sigma$-gate to the skip connection,
+# similarly to the input gate in LSTMs.
+# The details are not necessarily important here, and the network is
+# strongly inspired by Flow++ [3] in case you are interested in building
+# even stronger models.
+
+
+# %%
+class ConcatELU(nn.Module):
+    """Activation function that applies ELU in both directions (inverted and plain).
+
+    Allows non-linearity while providing strong gradients for any input (important for final convolution)
+    """
+
+    def forward(self, x):
+        return torch.cat([F.elu(x), F.elu(-x)], dim=1)
+
+
+class LayerNormChannels(nn.Module):
+    def __init__(self, c_in, eps=1e-5):
+        """This module applies layer norm across channels in an image.
+
+        Args:
+            c_in: Number of channels of the input
+            eps: Small constant to stabilize std
+        """
+        super().__init__()
+        self.gamma = nn.Parameter(torch.ones(1, c_in, 1, 1))
+        self.beta = nn.Parameter(torch.zeros(1, c_in, 1, 1))
+        self.eps = eps
+
+    def forward(self, x):
+        mean = x.mean(dim=1, keepdim=True)
+        var = x.var(dim=1, unbiased=False, keepdim=True)
+        y = (x - mean) / torch.sqrt(var + self.eps)
+        y = y * self.gamma + self.beta
+        return y
+
+
+class GatedConv(nn.Module):
+    def __init__(self, c_in, c_hidden):
+        """This module applies a two-layer convolutional ResNet block with input gate.
+ + Args: + c_in: Number of channels of the input + c_hidden: Number of hidden dimensions we want to model (usually similar to c_in) + """ + super().__init__() + self.net = nn.Sequential( + ConcatELU(), + nn.Conv2d(2 * c_in, c_hidden, kernel_size=3, padding=1), + ConcatELU(), + nn.Conv2d(2 * c_hidden, 2 * c_in, kernel_size=1), + ) + + def forward(self, x): + out = self.net(x) + val, gate = out.chunk(2, dim=1) + return x + val * torch.sigmoid(gate) + + +class GatedConvNet(nn.Module): + def __init__(self, c_in, c_hidden=32, c_out=-1, num_layers=3): + """Module that summarizes the previous blocks to a full convolutional neural network. + + Args: + c_in: Number of input channels + c_hidden: Number of hidden dimensions to use within the network + c_out: Number of output channels. If -1, 2 times the input channels are used (affine coupling) + num_layers: Number of gated ResNet blocks to apply + """ + super().__init__() + c_out = c_out if c_out > 0 else 2 * c_in + layers = [] + layers += [nn.Conv2d(c_in, c_hidden, kernel_size=3, padding=1)] + for layer_index in range(num_layers): + layers += [GatedConv(c_hidden, c_hidden), LayerNormChannels(c_hidden)] + layers += [ConcatELU(), nn.Conv2d(2 * c_hidden, c_out, kernel_size=3, padding=1)] + self.nn = nn.Sequential(*layers) + + self.nn[-1].weight.data.zero_() + self.nn[-1].bias.data.zero_() + + def forward(self, x): + return self.nn(x) + + +# %% [markdown] +# ### Training loop +# +# Finally, we can add Dequantization, Variational Dequantization and Coupling Layers together to build our full normalizing flow on MNIST images. +# We apply 8 coupling layers in the main flow, and 4 for variational dequantization if applied. +# We apply a checkerboard mask throughout the network as with a single channel (black-white images), +# we cannot apply channel mask. +# The overall architecture is visualized below. +# +# +#
+ + +# %% +def create_simple_flow(use_vardeq=True): + flow_layers = [] + if use_vardeq: + vardeq_layers = [ + CouplingLayer( + network=GatedConvNet(c_in=2, c_out=2, c_hidden=16), + mask=create_checkerboard_mask(h=28, w=28, invert=(i % 2 == 1)), + c_in=1, + ) + for i in range(4) + ] + flow_layers += [VariationalDequantization(var_flows=vardeq_layers)] + else: + flow_layers += [Dequantization()] + + for i in range(8): + flow_layers += [ + CouplingLayer( + network=GatedConvNet(c_in=1, c_hidden=32), + mask=create_checkerboard_mask(h=28, w=28, invert=(i % 2 == 1)), + c_in=1, + ) + ] + + flow_model = ImageFlow(flow_layers).to(device) + return flow_model + + +# %% [markdown] +# For implementing the training loop, we use the framework of PyTorch Lightning and reduce the code overhead. +# If interested, you can take a look at the generated tensorboard file, +# in particularly the graph to see an overview of flow transformations that are applied. +# Note that we again provide pre-trained models (see later on in the notebook) +# as normalizing flows are particularly expensive to train. +# We have also run validation and testing as this can take some time as well with the added importance sampling. + + +# %% +def train_flow(flow, model_name="MNISTFlow"): + # Create a PyTorch Lightning trainer + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, model_name), + accelerator="auto", + devices=1, + max_epochs=200, + gradient_clip_val=1.0, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_bpd"), + LearningRateMonitor("epoch"), + ], + ) + trainer.logger._log_graph = True + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + train_data_loader = data.DataLoader( + train_set, batch_size=128, shuffle=True, drop_last=True, pin_memory=True, num_workers=8 + ) + result = None + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, model_name + ".ckpt") + if os.path.isfile(pretrained_filename): + print("Found pretrained model, loading...") + ckpt = torch.load(pretrained_filename, map_location=device) + flow.load_state_dict(ckpt["state_dict"]) + result = ckpt.get("result", None) + else: + print("Start training", model_name) + trainer.fit(flow, train_data_loader, val_loader) + + # Test best model on validation and test set if no result has been found + # Testing can be expensive due to the importance sampling. + if result is None: + val_result = trainer.test(flow, dataloaders=val_loader, verbose=False) + start_time = time.time() + test_result = trainer.test(flow, dataloaders=test_loader, verbose=False) + duration = time.time() - start_time + result = {"test": test_result, "val": val_result, "time": duration / len(test_loader) / flow.import_samples} + + return flow, result + + +# %% [markdown] +# ## Multi-scale architecture +# +#
+#
+# One disadvantage of normalizing flows is that they operate on the exact same dimensions as the input.
+# If the input is high-dimensional, so is the latent space, which requires a larger computational cost to learn suitable transformations.
+# However, particularly in the image domain, many pixels contain less information in the sense
+# that we could remove them without losing the semantic information of the image.
+#
+# Based on this intuition, deep normalizing flows on images commonly apply a multi-scale architecture [1].
+# After the first $N$ flow transformations, we split off half of the latent dimensions and directly evaluate them on the prior.
+# The other half is run through $N$ more flow transformations, and depending on the size of the input,
+# we split it again in half or stop overall at this position.
+# The two operations involved in this setup are `Squeeze` and `Split`, which
+# we will review more closely and implement below.
+
+# %% [markdown]
+# ### Squeeze and Split
+#
+# When we want to remove half of the pixels in an image, we have the problem of deciding which variables to cut,
+# and how to rearrange the image.
+# Thus, the squeeze operation is commonly used before the split: it divides the image into subsquares
+# of shape $2\times 2\times C$, and reshapes them into $1\times 1\times 4C$ blocks.
+# Effectively, we reduce the height and width of the image by a factor of 2 while scaling the number of channels by 4.
+# Afterwards, we can perform the split operation over channels without needing to rearrange the pixels.
+# The smaller scale also makes the overall architecture more efficient.
+# Visually, the squeeze operation should transform the input as follows:
+#
+#
+# +# The input of $4\times 4\times 1$ is scaled to $2\times 2\times 4$ following +# the idea of grouping the pixels in $2\times 2\times 1$ subsquares. +# Next, let's try to implement this layer: + + +# %% +class SqueezeFlow(nn.Module): + def forward(self, z, ldj, reverse=False): + B, C, H, W = z.shape + if not reverse: + # Forward direction: H x W x C => H/2 x W/2 x 4C + z = z.reshape(B, C, H // 2, 2, W // 2, 2) + z = z.permute(0, 1, 3, 5, 2, 4) + z = z.reshape(B, 4 * C, H // 2, W // 2) + else: + # Reverse direction: H/2 x W/2 x 4C => H x W x C + z = z.reshape(B, C // 4, 2, 2, H, W) + z = z.permute(0, 1, 4, 2, 5, 3) + z = z.reshape(B, C // 4, H * 2, W * 2) + return z, ldj + + +# %% [markdown] +# Before moving on, we can verify our implementation by comparing our output with the example figure above: + +# %% +sq_flow = SqueezeFlow() +rand_img = torch.arange(1, 17).view(1, 1, 4, 4) +print("Image (before)\n", rand_img) +forward_img, _ = sq_flow(rand_img, ldj=None, reverse=False) +print("\nImage (forward)\n", forward_img.permute(0, 2, 3, 1)) # Permute for readability +reconst_img, _ = sq_flow(forward_img, ldj=None, reverse=True) +print("\nImage (reverse)\n", reconst_img) + +# %% [markdown] +# The split operation divides the input into two parts, and evaluates one part directly on the prior. +# So that our flow operation fits to the implementation of the previous layers, +# we will return the prior probability of the first part as the log determinant jacobian of the layer. +# It has the same effect as if we would combine all variable splits at the +# end of the flow, and evaluate them together on the prior. + + +# %% +class SplitFlow(nn.Module): + def __init__(self): + super().__init__() + self.prior = torch.distributions.normal.Normal(loc=0.0, scale=1.0) + + def forward(self, z, ldj, reverse=False): + if not reverse: + z, z_split = z.chunk(2, dim=1) + ldj += self.prior.log_prob(z_split).sum(dim=[1, 2, 3]) + else: + z_split = self.prior.sample(sample_shape=z.shape).to(device) + z = torch.cat([z, z_split], dim=1) + ldj -= self.prior.log_prob(z_split).sum(dim=[1, 2, 3]) + return z, ldj + + +# %% [markdown] +# ### Building a multi-scale flow +# +# After defining the squeeze and split operation, we are finally able to build our own multi-scale flow. +# Deep normalizing flows such as Glow and Flow++ [2,3] often apply a split operation directly after squeezing. +# However, with shallow flows, we need to be more thoughtful about where to place the split operation as we need at least a minimum amount of transformations on each variable. +# Our setup is inspired by the original RealNVP architecture [1] which is shallower than other, +# more recent state-of-the-art architectures. +# +# Hence, for the MNIST dataset, we will apply the first squeeze operation after two coupling layers, but don't apply a split operation yet. +# Because we have only used two coupling layers and each the variable has been only transformed once, a split operation would be too early. +# We apply two more coupling layers before finally applying a split flow and squeeze again. +# The last four coupling layers operate on a scale of $7\times 7\times 8$. +# The full flow architecture is shown below. +# +#
+# +# Note that while the feature maps inside the coupling layers reduce with the height and width of the input, +# the increased number of channels is not directly considered. +# To counteract this, we increase the hidden dimensions for the coupling layers on the squeezed input. +# The dimensions are often scaled by 2 as this approximately increases the computation cost by 4 canceling with the squeezing operation. +# However, we will choose the hidden dimensionalities $32, 48, 64$ for the +# three scales respectively to keep the number of parameters reasonable +# and show the efficiency of multi-scale architectures. + + +# %% +def create_multiscale_flow(): + flow_layers = [] + + vardeq_layers = [ + CouplingLayer( + network=GatedConvNet(c_in=2, c_out=2, c_hidden=16), + mask=create_checkerboard_mask(h=28, w=28, invert=(i % 2 == 1)), + c_in=1, + ) + for i in range(4) + ] + flow_layers += [VariationalDequantization(vardeq_layers)] + + flow_layers += [ + CouplingLayer( + network=GatedConvNet(c_in=1, c_hidden=32), + mask=create_checkerboard_mask(h=28, w=28, invert=(i % 2 == 1)), + c_in=1, + ) + for i in range(2) + ] + flow_layers += [SqueezeFlow()] + for i in range(2): + flow_layers += [ + CouplingLayer( + network=GatedConvNet(c_in=4, c_hidden=48), mask=create_channel_mask(c_in=4, invert=(i % 2 == 1)), c_in=4 + ) + ] + flow_layers += [SplitFlow(), SqueezeFlow()] + for i in range(4): + flow_layers += [ + CouplingLayer( + network=GatedConvNet(c_in=8, c_hidden=64), mask=create_channel_mask(c_in=8, invert=(i % 2 == 1)), c_in=8 + ) + ] + + flow_model = ImageFlow(flow_layers).to(device) + return flow_model + + +# %% [markdown] +# We can show the difference in number of parameters below: + + +# %% +def print_num_params(model): + num_params = sum(np.prod(p.shape) for p in model.parameters()) + print(f"Number of parameters: {num_params:,}") + + +print_num_params(create_simple_flow(use_vardeq=False)) +print_num_params(create_simple_flow(use_vardeq=True)) +print_num_params(create_multiscale_flow()) + +# %% [markdown] +# Although the multi-scale flow has almost 3 times the parameters of the single scale flow, +# it is not necessarily more computationally expensive than its counterpart. +# We will compare the runtime in the following experiments as well. + +# %% [markdown] +# ## Analysing the flows +# +# In the last part of the notebook, we will train all the models we have implemented above, +# and try to analyze the effect of the multi-scale architecture and variational dequantization. +# +# ### Training flow variants +# +# Before we can analyse the flow models, we need to train them first. +# We provide pre-trained models that contain the validation and test performance, and run-time information. +# As flow models are computationally expensive, we advice you to rely on +# those pretrained models for a first run through the notebook. + +# %% +flow_dict = {"simple": {}, "vardeq": {}, "multiscale": {}} +flow_dict["simple"]["model"], flow_dict["simple"]["result"] = train_flow( + create_simple_flow(use_vardeq=False), model_name="MNISTFlow_simple" +) +flow_dict["vardeq"]["model"], flow_dict["vardeq"]["result"] = train_flow( + create_simple_flow(use_vardeq=True), model_name="MNISTFlow_vardeq" +) +flow_dict["multiscale"]["model"], flow_dict["multiscale"]["result"] = train_flow( + create_multiscale_flow(), model_name="MNISTFlow_multiscale" +) + +# %% [markdown] +# ### Density modeling and sampling +# +# Firstly, we can compare the models on their quantitative results. 
+# The following table shows all important statistics.
+# The inference time specifies the time needed to determine the
+# probability for a batch of 64 images for each model, and the sampling
+# time is the duration it took to sample a batch of 64 images.

+# %% language="html"
+#
+#

+# %%
+
+table = [
+    [
+        key,
+        "%4.3f bpd" % flow_dict[key]["result"]["val"][0]["test_bpd"],
+        "%4.3f bpd" % flow_dict[key]["result"]["test"][0]["test_bpd"],
+        "%2.0f ms" % (1000 * flow_dict[key]["result"]["time"]),
+        "%2.0f ms" % (1000 * flow_dict[key]["result"].get("samp_time", 0)),
+        "{:,}".format(sum(np.prod(p.shape) for p in flow_dict[key]["model"].parameters())),
+    ]
+    for key in flow_dict
+]
+display(
+    HTML(
+        tabulate.tabulate(
+            table,
+            tablefmt="html",
+            headers=["Model", "Validation Bpd", "Test Bpd", "Inference time", "Sampling time", "Num Parameters"],
+        )
+    )
+)

+# %% [markdown]
+# As we had initially expected, using variational dequantization improves upon standard dequantization in terms of bits per dimension.
+# Although the difference of 0.04bpd doesn't seem impressive at first, it is a considerable step for generative models
+# (most state-of-the-art models improve upon previous models by 0.02-0.1bpd on CIFAR, where the bpd is about three times as high).
+# While it takes longer to evaluate the probability of an image due to the variational dequantization,
+# which also leads to a longer training time, it does not have an effect on the sampling time.
+# This is because inverting variational dequantization is the same as for standard dequantization: finding the next lower integer.
+#
+# When we compare the two models to the multi-scale architecture, we can see that the bits per dimension score again dropped by about 0.04bpd.
+# Additionally, the inference time and sampling time improved notably despite having more parameters.
+# Thus, we see that the multi-scale flow is not only stronger for density modeling, but also more efficient.
+#
+# Next, we can test the sampling quality of the models.
+# We should note that the samples for variational dequantization and standard dequantization are very similar,
+# and hence we visualize here only the ones for variational dequantization and the multi-scale model.
+# However, feel free to also test out the `"simple"` model.
+# The seeds are set to obtain reproducible generations and are not cherry-picked.

+# %%
+L.seed_everything(44)
+samples = flow_dict["vardeq"]["model"].sample(img_shape=[16, 1, 28, 28])
+show_imgs(samples.cpu())

+# %%
+L.seed_everything(44)
+samples = flow_dict["multiscale"]["model"].sample(img_shape=[16, 8, 7, 7])
+show_imgs(samples.cpu())

+# %% [markdown]
+# From the few samples, we can see a clear difference between the simple and the multi-scale model.
+# The single-scale model has only learned local, small correlations while the multi-scale model was able to learn full,
+# global relations that form digits.
+# This showcases another benefit of the multi-scale model.
+# In contrast to VAEs, the outputs are sharp, as normalizing flows can naturally model complex,
+# multi-modal distributions while VAEs suffer from the independent decoder output noise.
+# Nevertheless, the samples from this flow are far from perfect, as not all samples show true digits.

+# %% [markdown]
+# ### Interpolation in latent space
+#
+# Another popular test for the smoothness of the latent space of generative models is to interpolate between two training examples.
+# As normalizing flows are strictly invertible, we can guarantee that any image is represented in the latent space.
+# We again compare the variational dequantization model with the multi-scale model below.
+
+
+# %%
+@torch.no_grad()
+def interpolate(model, img1, img2, num_steps=8):
+    """Interpolate.
+
+    Args:
+        model: object of ImageFlow class that represents the (trained) flow model
+        img1, img2: Image tensors of shape [1, 28, 28]. Images between which to interpolate.
+        num_steps: Number of interpolation steps. 8 interpolation steps mean 6 intermediate pictures besides img1 and img2
+    """
+    imgs = torch.stack([img1, img2], dim=0).to(model.device)
+    z, _ = model.encode(imgs)
+    alpha = torch.linspace(0, 1, steps=num_steps, device=z.device).view(-1, 1, 1, 1)
+    interpolations = z[0:1] * alpha + z[1:2] * (1 - alpha)
+    interp_imgs = model.sample(interpolations.shape[:1] + imgs.shape[1:], z_init=interpolations)
+    show_imgs(interp_imgs, row_size=8)
+
+
+exmp_imgs, _ = next(iter(train_loader))
+
+# %%
+L.seed_everything(42)
+for i in range(2):
+    interpolate(flow_dict["vardeq"]["model"], exmp_imgs[2 * i], exmp_imgs[2 * i + 1])
+
+# %%
+L.seed_everything(42)
+for i in range(2):
+    interpolate(flow_dict["multiscale"]["model"], exmp_imgs[2 * i], exmp_imgs[2 * i + 1])
+
+# %% [markdown]
+# The interpolations of the multi-scale model result in more realistic digits
+# (first row $7\leftrightarrow 8\leftrightarrow 6$, second row $9\leftrightarrow 4\leftrightarrow 6$),
+# while the variational dequantization model focuses on local patterns that globally do not form a digit.
+# For the multi-scale model, we actually did not do the "true" interpolation between the two images,
+# as we did not consider the variables that were split along the flow (they have been sampled randomly for all samples).
+# However, as we will see in the next experiment, the early variables do not affect the overall image much.

+# %% [markdown]
+# ### Visualization of latents in different levels of multi-scale
+#
+# In the following, we will focus more on the multi-scale flow.
+# We want to analyse what information is being stored in the variables split at early layers,
+# and what information in the final variables.
+# For this, we sample 8 images that all share the same final latent variables,
+# but differ in the other part of the latent variables.
+# Below we visualize three examples of this:

+# %%
+L.seed_everything(44)
+for _ in range(3):
+    z_init = flow_dict["multiscale"]["model"].prior.sample(sample_shape=[1, 8, 7, 7])
+    z_init = z_init.expand(8, -1, -1, -1)
+    samples = flow_dict["multiscale"]["model"].sample(img_shape=z_init.shape, z_init=z_init)
+    show_imgs(samples.cpu())

+# %% [markdown]
+# We see that the early split variables indeed have a smaller effect on the image.
+# Still, small differences can be spotted when we look carefully at the borders of the digits.
+# For instance, the hole at the top of the 8 changes for different samples although all of them represent the same coarse structure.
+# This shows that the flow indeed learns to separate the higher-level
+# information in the final variables, while the early split ones contain
+# local noise patterns.

+# %% [markdown]
+# ### Visualizing Dequantization
+#
+# As a final part of this notebook, we will look at the effect of variational dequantization.
+# We have motivated variational dequantization by the issue of sharp edges/borders being difficult to model,
+# and a flow would rather prefer smooth, prior-like distributions.
+# To check what noise distribution $q(u|x)$ the flows in the
+# variational dequantization module have learned, we can plot a histogram
+# of output values from the dequantization and variational dequantization
+# module.


+# %%
+def visualize_dequant_distribution(model: ImageFlow, imgs: Tensor, title: str = None):
+    """Visualize dequant distribution.
+
+    Args:
+        model: The flow of which we want to visualize the dequantization distribution
+        imgs: Example training images of which we want to visualize the dequantization distribution
+    """
+    imgs = imgs.to(device)
+    ldj = torch.zeros(imgs.shape[0], dtype=torch.float32).to(device)
+    with torch.no_grad():
+        dequant_vals = []
+        for _ in tqdm(range(8), leave=False):
+            d, _ = model.flows[0](imgs, ldj, reverse=False)
+            dequant_vals.append(d)
+        dequant_vals = torch.cat(dequant_vals, dim=0)
+        dequant_vals = dequant_vals.view(-1).cpu().numpy()
+    sns.set()
+    plt.figure(figsize=(10, 3))
+    plt.hist(dequant_vals, bins=256, color=to_rgb("C0") + (0.5,), edgecolor="C0", density=True)
+    if title is not None:
+        plt.title(title)
+    plt.show()
+    plt.close()


+sample_imgs, _ = next(iter(train_loader))

+# %%
+visualize_dequant_distribution(flow_dict["simple"]["model"], sample_imgs, title="Dequantization")

+# %%
+visualize_dequant_distribution(flow_dict["vardeq"]["model"], sample_imgs, title="Variational dequantization")

+# %% [markdown]
+# The dequantization distribution in the first plot shows that the MNIST images have a strong bias towards 0 (black),
+# and that their distribution has a sharp border, as mentioned before.
+# The variational dequantization module has indeed learned a much smoother distribution with a Gaussian-like curve which can be modeled much better.
+# For the other values, we would need to visualize the distribution $q(u|x)$ on a deeper level, depending on $x$.
+# However, as all $u$'s interact and depend on each other, we would need
+# to visualize a distribution in 784 dimensions, which is not that
+# intuitive anymore.

+# %% [markdown]
+# ## Conclusion
+#
+# In conclusion, we have seen how to implement our own normalizing flow, and what difficulties arise if we want to apply it to images.
+# Dequantization is a crucial step in mapping the discrete images into continuous space to prevent undesirable delta-peak solutions.
+# While dequantization creates hypercubes with hard borders, variational dequantization allows us to fit a flow much better on the data.
+# This allows us to obtain a lower bits per dimension score, while not affecting the sampling speed.
+# The most common flow element, the coupling layer, is simple to implement, and yet effective.
+# Furthermore, multi-scale architectures help to capture the global image context while allowing us to efficiently scale up the flow.
+# Normalizing flows are an interesting alternative to VAEs as they allow an exact likelihood estimate in continuous space,
+# and we have the guarantee that every possible input $x$ has a corresponding latent vector $z$.
+# However, even beyond continuous inputs and images, flows can be applied and allow us to exploit
+# the data structure in latent space, e.g. on graphs for the task of molecule generation [6].
+# Recent advances in [Neural ODEs](https://arxiv.org/pdf/1806.07366.pdf) allow a flow with an infinite number of layers,
+# called Continuous Normalizing Flows, whose potential is yet to be fully explored.
+# Overall, normalizing flows are an exciting research area which will continue to develop over the next couple of years.
+
+# %% [markdown]
+# ## References
+#
+# [1] Dinh, L., Sohl-Dickstein, J., and Bengio, S. (2017).
+# “Density estimation using Real NVP,” In: 5th International Conference on Learning Representations, ICLR 2017.
+# [Link](https://arxiv.org/abs/1605.08803)
+#
+# [2] Kingma, D. P., and Dhariwal, P. (2018).
+# “Glow: Generative Flow with Invertible 1x1 Convolutions,” In: Advances in Neural Information Processing Systems, vol. 31, pp. 10215–10224.
+# [Link](http://papers.nips.cc/paper/8224-glow-generative-flow-with-invertible-1x1-convolutions.pdf)
+#
+# [3] Ho, J., Chen, X., Srinivas, A., Duan, Y., and Abbeel, P. (2019).
+# “Flow++: Improving Flow-Based Generative Models with Variational Dequantization and Architecture Design,”
+# In: Proceedings of the 36th International Conference on Machine Learning, vol. 97, pp. 2722–2730.
+# [Link](https://arxiv.org/abs/1902.00275)
+#
+# [4] Durkan, C., Bekasov, A., Murray, I., and Papamakarios, G. (2019).
+# “Neural Spline Flows,” In: Advances in Neural Information Processing Systems, pp. 7509–7520.
+# [Link](http://papers.neurips.cc/paper/8969-neural-spline-flows.pdf)
+#
+# [5] Hoogeboom, E., Cohen, T. S., and Tomczak, J. M. (2020).
+# “Learning Discrete Distributions by Dequantization,” arXiv preprint arXiv:2001.11235.
+# [Link](https://arxiv.org/abs/2001.11235)
+#
+# [6] Lippe, P., and Gavves, E. (2021).
+# “Categorical Normalizing Flows via Continuous Transformations,”
+# In: International Conference on Learning Representations, ICLR 2021.
+# [Link](https://openreview.net/pdf?id=-GLNZeVDuik)
diff --git a/course_UvA-DL/09-normalizing-flows/Squeeze_operation.svg b/course_UvA-DL/09-normalizing-flows/Squeeze_operation.svg
new file mode 100644
index 0000000..cf66772
--- /dev/null
+++ b/course_UvA-DL/09-normalizing-flows/Squeeze_operation.svg
@@ -0,0 +1,3 @@
diff --git a/course_UvA-DL/09-normalizing-flows/comparison_GAN_VAE_NF.png b/course_UvA-DL/09-normalizing-flows/comparison_GAN_VAE_NF.png
new file mode 100644
index 0000000..fffe9d2
Binary files /dev/null and b/course_UvA-DL/09-normalizing-flows/comparison_GAN_VAE_NF.png differ
diff --git a/course_UvA-DL/09-normalizing-flows/coupling_flow.svg b/course_UvA-DL/09-normalizing-flows/coupling_flow.svg
new file mode 100644
index 0000000..817758c
--- /dev/null
+++ b/course_UvA-DL/09-normalizing-flows/coupling_flow.svg
@@ -0,0 +1,926 @@
diff --git a/course_UvA-DL/09-normalizing-flows/dequantization_issue.svg b/course_UvA-DL/09-normalizing-flows/dequantization_issue.svg
new file mode 100644
index 0000000..f254085
--- /dev/null
+++ b/course_UvA-DL/09-normalizing-flows/dequantization_issue.svg
@@ -0,0 +1,417 @@
diff --git a/course_UvA-DL/09-normalizing-flows/image_to_gaussian.svg b/course_UvA-DL/09-normalizing-flows/image_to_gaussian.svg
new file mode 100644
index 0000000..f287768
--- /dev/null
+++ b/course_UvA-DL/09-normalizing-flows/image_to_gaussian.svg
@@ -0,0 +1,3 @@
diff --git a/course_UvA-DL/09-normalizing-flows/multiscale_flow.svg b/course_UvA-DL/09-normalizing-flows/multiscale_flow.svg new file mode 100644 index 0000000..db7af24 --- /dev/null +++ b/course_UvA-DL/09-normalizing-flows/multiscale_flow.svg @@ -0,0 +1,3 @@ + + +
diff --git a/course_UvA-DL/09-normalizing-flows/normalizing_flow_layout.png b/course_UvA-DL/09-normalizing-flows/normalizing_flow_layout.png new file mode 100644 index 0000000..9a164b9 Binary files /dev/null and b/course_UvA-DL/09-normalizing-flows/normalizing_flow_layout.png differ diff --git a/course_UvA-DL/09-normalizing-flows/uniform_flow.png b/course_UvA-DL/09-normalizing-flows/uniform_flow.png new file mode 100644 index 0000000..34a08cf Binary files /dev/null and b/course_UvA-DL/09-normalizing-flows/uniform_flow.png differ diff --git a/course_UvA-DL/09-normalizing-flows/vanilla_flow.svg b/course_UvA-DL/09-normalizing-flows/vanilla_flow.svg new file mode 100644 index 0000000..6220752 --- /dev/null +++ b/course_UvA-DL/09-normalizing-flows/vanilla_flow.svg @@ -0,0 +1,3 @@ + + +
[SVG text labels: p(x) 1x28x28 -> (Variational) Dequantization -> Coupling layers -> p(z) 1x28x28]
diff --git a/course_UvA-DL/10-autoregressive-image-modeling/.meta.yml b/course_UvA-DL/10-autoregressive-image-modeling/.meta.yml new file mode 100644 index 0000000..ac181f2 --- /dev/null +++ b/course_UvA-DL/10-autoregressive-image-modeling/.meta.yml @@ -0,0 +1,23 @@ +title: "Tutorial 10: Autoregressive Image Modeling" +author: Phillip Lippe +created: 2021-07-12 +updated: 2023-03-14 +license: CC BY-SA +build: 0 +tags: + - Image +description: | + In this tutorial, we implement an autoregressive likelihood model for the task of image modeling. + Autoregressive models are naturally strong generative models that constitute one of the current + state-of-the-art architectures on likelihood-based image modeling, + and are also the basis for large language generation models such as GPT3. + We will focus on the PixelCNN architecture in this tutorial, and apply it to MNIST modeling. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - torchvision + - matplotlib + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - GPU diff --git a/course_UvA-DL/10-autoregressive-image-modeling/.thumb.jpg b/course_UvA-DL/10-autoregressive-image-modeling/.thumb.jpg new file mode 100644 index 0000000..1ad5d61 Binary files /dev/null and b/course_UvA-DL/10-autoregressive-image-modeling/.thumb.jpg differ diff --git a/course_UvA-DL/10-autoregressive-image-modeling/Autoregressive_Image_Modeling.py b/course_UvA-DL/10-autoregressive-image-modeling/Autoregressive_Image_Modeling.py new file mode 100644 index 0000000..5adb928 --- /dev/null +++ b/course_UvA-DL/10-autoregressive-image-modeling/Autoregressive_Image_Modeling.py @@ -0,0 +1,977 @@ +# %% [markdown] +#
+# +# Similar to the language generation you have seen in assignment 2, autoregressive models work on images by modeling the likelihood of a pixel given all previous ones. +# For instance, in the picture below, we model the pixel $x_i$ as a conditional probability distribution +# based on all previous (here blue) pixels (figure credit - [Aaron van den Oord et al. ](https://arxiv.org/abs/1601.06759)): +# +#
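+#
+# For a tiny concrete example, consider a $2\times 2$ grayscale image with pixels $x_1, x_2, x_3, x_4$ in row-major order.
+# Modeling each pixel conditioned on all previous ones means the joint likelihood factorizes as
+#
+# $$p(x_1, x_2, x_3, x_4) = p(x_1)\cdot p(x_2|x_1)\cdot p(x_3|x_1,x_2)\cdot p(x_4|x_1,x_2,x_3)$$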
+# +# Generally, autoregressive model over high-dimensional data $\mathbf{x}$ factor the joint distribution as the following product of conditionals: +# +# $$p(\mathbf{x})=p(x_1, ..., x_n)=\prod_{i=1}^{n} p(x_i|x_1,...,x_{i-1})$$ +# +# Learning these conditionals is often much simpler than learning the joint distribution $p(\mathbf{x})$ all together. +# However, disadvantages of autoregressive models include slow sampling, especially for large images, +# as we need height-times-width forward passes through the model. +# In addition, for some applications, we require a latent space as modeled in VAEs and Normalizing Flows. +# For instance, in autoregressive models, we cannot interpolate between two images because of the lack of a latent representation. +# We will explore and discuss these benefits and drawbacks alongside with our implementation. +# +# Our implementation will focus on the [PixelCNN](https://arxiv.org/pdf/1606.05328.pdf) [2] model which has been discussed in detail in the lecture. +# Most current SOTA models use PixelCNN as their fundamental architecture, +# and various additions have been proposed to improve the performance +# (e.g. [PixelCNN++](https://arxiv.org/pdf/1701.05517.pdf) and [PixelSNAIL](http://proceedings.mlr.press/v80/chen18h/chen18h.pdf)). +# Hence, implementing PixelCNN is a good starting point for our short tutorial. +# +# First of all, we need to import our standard libraries. Similarly as in +# the last couple of tutorials, we will use [PyTorch +# Lightning](https://lightning.ai/docs/pytorch/stable/) here as +# well. + +# %% + +import math +import os +import urllib.request +from urllib.error import HTTPError + +import lightning as L + +# Imports for plotting +import matplotlib.pyplot as plt +import matplotlib_inline.backend_inline +import numpy as np +import seaborn as sns +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data +import torchvision +from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint +from matplotlib.colors import to_rgb +from torch import Tensor +from torchvision import transforms +from torchvision.datasets import MNIST +from tqdm.notebook import tqdm + +plt.set_cmap("cividis") +# %matplotlib inline +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export + +# Path to the folder where the datasets are/should be downloaded (e.g. MNIST) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/tutorial12") + +# Setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.determinstic = True +torch.backends.cudnn.benchmark = False + +# Fetching the device that will be used throughout this notebook +device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0") +print("Using device", device) + +# %% [markdown] +# We again provide a pretrained model, which is downloaded below: + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial12/" +# Files to download +pretrained_files = ["PixelCNN.ckpt"] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. 
+for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print("Downloading %s..." % file_url) + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# Similar to the Normalizing Flows in Tutorial 11, we will work on the +# MNIST dataset and use 8-bits per pixel (values between 0 and 255). The +# dataset is loaded below: + + +# %% +# Convert images from 0-1 to 0-255 (integers). We use the long datatype as we will use the images as labels as well +def discretize(sample): + return (sample * 255).to(torch.long) + + +# Transformations applied on each image => only make them a tensor +transform = transforms.Compose([transforms.ToTensor(), discretize]) + +# Loading the training dataset. We need to split it into a training and validation part +train_dataset = MNIST(root=DATASET_PATH, train=True, transform=transform, download=True) +L.seed_everything(42) +train_set, val_set = torch.utils.data.random_split(train_dataset, [50000, 10000]) + +# Loading the test set +test_set = MNIST(root=DATASET_PATH, train=False, transform=transform, download=True) + +# We define a set of data loaders that we can use for various purposes later. +train_loader = data.DataLoader(train_set, batch_size=128, shuffle=True, drop_last=True, pin_memory=True, num_workers=4) +val_loader = data.DataLoader(val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) +test_loader = data.DataLoader(test_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) + +# %% [markdown] +# A good practice is to always visualize some data examples to get an intuition of the data: + + +# %% +def show_imgs(imgs): + num_imgs = imgs.shape[0] if isinstance(imgs, Tensor) else len(imgs) + nrow = min(num_imgs, 4) + ncol = int(math.ceil(num_imgs / nrow)) + imgs = torchvision.utils.make_grid(imgs, nrow=nrow, pad_value=128) + imgs = imgs.clamp(min=0, max=255) + np_imgs = imgs.cpu().numpy() + plt.figure(figsize=(1.5 * nrow, 1.5 * ncol)) + plt.imshow(np.transpose(np_imgs, (1, 2, 0)), interpolation="nearest") + plt.axis("off") + plt.show() + plt.close() + + +show_imgs([train_set[i][0] for i in range(8)]) + +# %% [markdown] +# ## Masked autoregressive convolutions +# +# The core module of PixelCNN is its masked convolutions. +# In contrast to language models, we don't apply an LSTM on each pixel one-by-one. +# This would be inefficient because images are grids instead of sequences. +# Thus, it is better to rely on convolutions that have shown great success in deep CNN classification models. +# +# Nevertheless, we cannot just apply standard convolutions without any changes. +# Remember that during training of autoregressive models, we want to use teacher forcing which both helps the model training, and significantly reduces the time needed for training. +# For image modeling, teacher forcing is implemented by using a training image as input to the model, and we want to obtain as output the prediction for each pixel based on *only* its predecessors. +# Thus, we need to ensure that the prediction for a specific pixel can only be influenced by its predecessors and not by its own value or any "future" pixels. +# For this, we apply convolutions with a mask. 
+#
+# Which mask we use depends on the ordering of pixels we decide on, i.e., which pixel we predict first,
+# which one second, and so on.
+# The most commonly used ordering is to take the upper-left pixel as the start pixel,
+# and to sort the pixels row by row, as shown in the visualization at the top of the tutorial.
+# Thus, the second pixel is to the right of the first one (first row, second column),
+# and once we reach the end of a row, we continue with the first column of the next row.
+# If we now want to apply this to our convolutions, we need to ensure that the prediction of pixel 1
+# is influenced neither by its own "true" input nor by any pixel to its right or in any lower row.
+# For convolutions, this means setting those entries of the weight matrix to zero that would take pixels to the right of and below the center into account.
+# As an example for a 5x5 kernel, see a mask below (figure credit - [Aaron van den Oord](https://arxiv.org/pdf/1606.05328.pdf)):
+#
+#
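+# As a small illustration (the helper below is only a sketch of the idea, not part of the tutorial's own code),
+# the mask from the figure can be written out directly: ones for all rows above the center and for the pixels
+# left of the center, zeros everywhere to the right of and below it.
+# Whether the center entry itself is masked depends on whether this is the very first layer.
+
+# %%
+def build_row_major_mask(kernel_size: int = 5, mask_center: bool = True) -> torch.Tensor:
+    """Hypothetical helper (illustration only): mask for the row-major pixel ordering described above."""
+    mask = torch.ones(kernel_size, kernel_size)
+    center = kernel_size // 2
+    # Everything strictly to the right of the center in the center row is "future"
+    mask[center, center + 1 :] = 0
+    # All rows below the center are "future"
+    mask[center + 1 :, :] = 0
+    if mask_center:
+        # For the very first layer, the pixel's own value must be hidden as well
+        mask[center, center] = 0
+    return mask
+
+
+print(build_row_major_mask(5, mask_center=True))
+
+# %% [markdown]
+# Note that the implementation below will not use this single mask directly; instead, it splits the masking
+# into a vertical and a horizontal stack, for reasons discussed further down.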
+# +# Before looking into the application of masked convolutions in PixelCNN +# in detail, let's first implement a module that allows us to apply an +# arbitrary mask to a convolution: + + +# %% +class MaskedConvolution(nn.Module): + def __init__(self, c_in, c_out, mask, **kwargs): + """Implements a convolution with mask applied on its weights. + + Args: + c_in: Number of input channels + c_out: Number of output channels + mask: Tensor of shape [kernel_size_H, kernel_size_W] with 0s where + the convolution should be masked, and 1s otherwise. + kwargs: Additional arguments for the convolution + """ + super().__init__() + # For simplicity: calculate padding automatically + kernel_size = (mask.shape[0], mask.shape[1]) + dilation = 1 if "dilation" not in kwargs else kwargs["dilation"] + padding = tuple(dilation * (kernel_size[i] - 1) // 2 for i in range(2)) + # Actual convolution + self.conv = nn.Conv2d(c_in, c_out, kernel_size, padding=padding, **kwargs) + + # Mask as buffer => it is no parameter but still a tensor of the module + # (must be moved with the devices) + self.register_buffer("mask", mask[None, None]) + + def forward(self, x): + self.conv.weight.data *= self.mask # Ensures zero's at masked positions + return self.conv(x) + + +# %% [markdown] +# ### Vertical and horizontal convolution stacks +# +# To build our own autoregressive image model, we could simply stack a few masked convolutions on top of each other. +# This was actually the case for the original PixelCNN model, discussed in the paper +# [Pixel Recurrent Neural Networks](https://arxiv.org/pdf/1601.06759.pdf), but this leads to a considerable issue. +# When sequentially applying a couple of masked convolutions, the receptive field of a pixel +# show to have a "blind spot" on the right upper side, as shown in the figure below +# (figure credit - [Aaron van den Oord et al. ](https://arxiv.org/pdf/1606.05328.pdf)): +# +#
+# +# Although a pixel should be able to take into account all other pixels above and left of it, +# a stack of masked convolutions does not allow us to look to the upper pixels on the right. +# This is because the features of the pixels above, which we use for convolution, +# do not contain any information of the pixels on the right of the same row. +# If they would, we would be "cheating" and actually looking into the future. +# To overcome this issue, van den Oord et. +# al [2] proposed to split the convolutions into a vertical and a horizontal stack. +# The vertical stack looks at all pixels above the current one, while the horizontal takes into account all on the left. +# While keeping both of them separate, we can actually look at the pixels on the right with the vertical stack without breaking any of our assumptions. +# The two convolutions are also shown in the figure above. +# +# Let us implement them here as follows: + + +# %% +class VerticalStackConvolution(MaskedConvolution): + def __init__(self, c_in, c_out, kernel_size=3, mask_center=False, **kwargs): + # Mask out all pixels below. For efficiency, we could also reduce the kernel + # size in height, but for simplicity, we stick with masking here. + mask = torch.ones(kernel_size, kernel_size) + mask[kernel_size // 2 + 1 :, :] = 0 + + # For the very first convolution, we will also mask the center row + if mask_center: + mask[kernel_size // 2, :] = 0 + + super().__init__(c_in, c_out, mask, **kwargs) + + +class HorizontalStackConvolution(MaskedConvolution): + def __init__(self, c_in, c_out, kernel_size=3, mask_center=False, **kwargs): + # Mask out all pixels on the left. Note that our kernel has a size of 1 + # in height because we only look at the pixel in the same row. + mask = torch.ones(1, kernel_size) + mask[0, kernel_size // 2 + 1 :] = 0 + + # For the very first convolution, we will also mask the center pixel + if mask_center: + mask[0, kernel_size // 2] = 0 + + super().__init__(c_in, c_out, mask, **kwargs) + + +# %% [markdown] +# Note that we have an input argument called `mask_center`. Remember that +# the input to the model is the actual input image. Hence, the very first +# convolution we apply cannot use the center pixel as input, but must be +# masked. All consecutive convolutions, however, should use the center +# pixel as we otherwise lose the features of the previous layer. Hence, +# the input argument `mask_center` is True for the very first +# convolutions, and False for all others. + +# %% [markdown] +# ### Visualizing the receptive field +# +# To validate our implementation of masked convolutions, we can visualize the receptive field we obtain with such convolutions. +# We should see that with increasing number of convolutional layers, the receptive field grows in both vertical and horizontal direction, without the issue of a blind spot. +# The receptive field can be empirically measured by backpropagating an arbitrary loss for the output features of a speicifc pixel with respect to the input. +# We implement this idea below, and visualize the receptive field below. + +# %% +inp_img = torch.zeros(1, 1, 11, 11) +inp_img.requires_grad_() + + +def show_center_recep_field(img, out): + """Calculates the gradients of the input with respect to the output center pixel, and visualizes the overall + receptive field. + + Args: + img: Input image for which we want to calculate the receptive field on. + out: Output features/loss which is used for backpropagation, and should be + the output of the network/computation graph. 
+ """ + # Determine gradients + loss = out[0, :, img.shape[2] // 2, img.shape[3] // 2].sum() # L1 loss for simplicity + # Retain graph as we want to stack multiple layers and show the receptive field of all of them + loss.backward(retain_graph=True) + img_grads = img.grad.abs() + img.grad.fill_(0) # Reset grads + + # Plot receptive field + img = img_grads.squeeze().cpu().numpy() + fig, ax = plt.subplots(1, 2) + _ = ax[0].imshow(img) + ax[1].imshow(img > 0) + # Mark the center pixel in red if it doesn't have any gradients (should be + # the case for standard autoregressive models) + show_center = img[img.shape[0] // 2, img.shape[1] // 2] == 0 + if show_center: + center_pixel = np.zeros(img.shape + (4,)) + center_pixel[center_pixel.shape[0] // 2, center_pixel.shape[1] // 2, :] = np.array([1.0, 0.0, 0.0, 1.0]) + for i in range(2): + ax[i].axis("off") + if show_center: + ax[i].imshow(center_pixel) + ax[0].set_title("Weighted receptive field") + ax[1].set_title("Binary receptive field") + plt.show() + plt.close() + + +show_center_recep_field(inp_img, inp_img) + +# %% [markdown] +# Let's first visualize the receptive field of a horizontal convolution +# without the center pixel. We use a small, arbitrary input image +# ($11\times 11$ pixels), and calculate the loss for the center pixel. For +# simplicity, we initialize all weights with 1 and the bias with 0, and +# use a single channel. This is sufficient for our visualization purposes. + +# %% +horiz_conv = HorizontalStackConvolution(c_in=1, c_out=1, kernel_size=3, mask_center=True) +horiz_conv.conv.weight.data.fill_(1) +horiz_conv.conv.bias.data.fill_(0) +horiz_img = horiz_conv(inp_img) +show_center_recep_field(inp_img, horiz_img) + +# %% [markdown] +# The receptive field is shown in yellow, the center pixel in red, and all other pixels outside of the receptive field are dark blue. +# As expected, the receptive field of a single horizontal convolution with the center pixel masked and a $3\times3$ kernel is only the pixel on the left. +# If we use a larger kernel size, more pixels would be taken into account on the left. +# +# Next, let's take a look at the vertical convolution: + +# %% +vert_conv = VerticalStackConvolution(c_in=1, c_out=1, kernel_size=3, mask_center=True) +vert_conv.conv.weight.data.fill_(1) +vert_conv.conv.bias.data.fill_(0) +vert_img = vert_conv(inp_img) +show_center_recep_field(inp_img, vert_img) + +# %% [markdown] +# The vertical convolution takes all pixels above into account. Combining +# these two, we get the L-shaped receptive field of the original masked +# convolution: + +# %% +horiz_img = vert_img + horiz_img +show_center_recep_field(inp_img, horiz_img) + +# %% [markdown] +# If we stack multiple horizontal and vertical convolutions, we need to take two aspects into account: +# +# 1. +# The center should not be masked anymore for the following convolutions as the features at the pixel's position are already independent of its actual value. +# If it is hard to imagine why we can do this, just change the value below to `mask_center=True` and see what happens. +# 2. +# The vertical convolution is not allowed to work on features from the horizontal convolution. +# In the feature map of the horizontal convolutions, a pixel contains information about all of the "true" pixels on the left. +# If we apply a vertical convolution which also uses features from the right, we effectively expand our receptive field to the true input which we want to prevent. 
+# Thus, the feature maps can only be merged for the horizontal convolution. +# +# Using this, we can stack the convolutions in the following way. We have +# two feature streams: one for the vertical stack, and one for the +# horizontal stack. The horizontal convolutions can operate on the joint +# features of the previous horizontals and vertical convolutions, while +# the vertical stack only takes its own previous features as input. For a +# quick implementation, we can therefore sum the horizontal and vertical +# output features at each layer, and use those as final output features to +# calculate the loss on. An implementation of 4 consecutive layers is +# shown below. Note that we reuse the features from the other convolutions +# with `mask_center=True` from above. + +# %% +# Initialize convolutions with equal weight to all input pixels +horiz_conv = HorizontalStackConvolution(c_in=1, c_out=1, kernel_size=3, mask_center=False) +horiz_conv.conv.weight.data.fill_(1) +horiz_conv.conv.bias.data.fill_(0) +vert_conv = VerticalStackConvolution(c_in=1, c_out=1, kernel_size=3, mask_center=False) +vert_conv.conv.weight.data.fill_(1) +vert_conv.conv.bias.data.fill_(0) + +# We reuse our convolutions for the 4 layers here. Note that in a standard network, +# we don't do that, and instead learn 4 separate convolution. As this cell is only for +# visualization purposes, we reuse the convolutions for all layers. +for l_idx in range(4): + vert_img = vert_conv(vert_img) + horiz_img = horiz_conv(horiz_img) + vert_img + print("Layer %i" % (l_idx + 2)) + show_center_recep_field(inp_img, horiz_img) + +# %% [markdown] +# The receptive field above it visualized for the horizontal stack, which includes the features of the vertical convolutions. +# It grows over layers without any blind spot as we had before. +# The difference between "weighted" and "binary" receptive field is that for the latter, we check whether there are any gradients flowing back to this pixel. +# This indicates that the center pixel indeed can use information from this pixel. +# Nevertheless, due to the convolution weights, some pixels have a stronger effect on the prediction than others. +# This is visualized in the weighted receptive field by plotting the gradient magnitude for each pixel instead of a binary yes/no. +# +# +# Another receptive field we can check is the one for the vertical stack +# as the one above is for the horizontal stack. Let's visualize it below: + +# %% +show_center_recep_field(inp_img, vert_img) + +# %% [markdown] +# As we have discussed before, the vertical stack only looks at pixels above the one we want to predict. +# Hence, we can validate that our implementation works as we initially expected it to. +# As a final step, let's clean up the computation graph we still had kept +# in memory for the visualization of the receptive field: + +# %% +del inp_img, horiz_conv, vert_conv + +# %% [markdown] +# ## Gated PixelCNN +# +#
+
+#
+# In the next step, we will use the masked convolutions to build a full autoregressive model, the Gated PixelCNN.
+# The difference between the original PixelCNN and the Gated PixelCNN is the use of separate horizontal and vertical stacks.
+# However, in the literature, you will often see the Gated PixelCNN referred to simply as "PixelCNN".
+# Hence, in the following, when we say "PixelCNN", we usually mean the gated version.
+# What "Gated" refers to in the model name is explained next.
+#
+# ### Gated Convolutions
+#
+# For visualizing the receptive field, we assumed a very simplified stack of vertical and horizontal convolutions.
+# Obviously, there are more sophisticated ways of combining the two stacks, and PixelCNN uses gated convolutions for this.
+# Specifically, the Gated Convolution block in PixelCNN looks as follows
+# (figure credit - [Aaron van den Oord et al.](https://arxiv.org/pdf/1606.05328.pdf)):
+#
+#
+# +# The left path is the vertical stack (the $N\times N$ convolution is masked correspondingly), +# and the right path is the horizontal stack. +# Gated convolutions are implemented by having a twice as large output channel size, +# and combine them by a element-wise multiplication of $\tanh$ and a sigmoid. +# For a linear layer, we can express a gated activation unit as follows: +# +# $$\mathbf{y} = \tanh\left(\mathbf{W}_{f}\mathbf{x}\right)\odot\sigma\left(\mathbf{W}_{g}\mathbf{x}\right)$$ +# +# For simplicity, biases have been neglected and the linear layer split into two part, $\mathbf{W}_{f}$ and $\mathbf{W}_{g}$. +# This concept resembles the input and modulation gate in an LSTM, and has been used in many other architectures as well. +# The main motivation behind this gated activation is that it might allow to model more complex interactions and simplifies learning. +# But as in any other architecture, this is mostly a design choice and can be considered a hyperparameters. +# +# Besides the gated convolutions, we also see that the horizontal stack uses a residual connection while the vertical stack does not. +# This is because we use the output of the horizontal stack for prediction. +# Each convolution in the vertical stack also receives a strong gradient signal +# as it is only two $1\times 1$ convolutions away from the residual connection, +# and does not require another residual connection to all its earleri layers. +# +# The implementation in PyTorch is fairly straight forward for this block, +# because the visualization above gives us a computation graph to follow: + + +# %% +class GatedMaskedConv(nn.Module): + def __init__(self, c_in, **kwargs): + """Gated Convolution block implemented the computation graph shown above.""" + super().__init__() + self.conv_vert = VerticalStackConvolution(c_in, c_out=2 * c_in, **kwargs) + self.conv_horiz = HorizontalStackConvolution(c_in, c_out=2 * c_in, **kwargs) + self.conv_vert_to_horiz = nn.Conv2d(2 * c_in, 2 * c_in, kernel_size=1, padding=0) + self.conv_horiz_1x1 = nn.Conv2d(c_in, c_in, kernel_size=1, padding=0) + + def forward(self, v_stack, h_stack): + # Vertical stack (left) + v_stack_feat = self.conv_vert(v_stack) + v_val, v_gate = v_stack_feat.chunk(2, dim=1) + v_stack_out = torch.tanh(v_val) * torch.sigmoid(v_gate) + + # Horizontal stack (right) + h_stack_feat = self.conv_horiz(h_stack) + h_stack_feat = h_stack_feat + self.conv_vert_to_horiz(v_stack_feat) + h_val, h_gate = h_stack_feat.chunk(2, dim=1) + h_stack_feat = torch.tanh(h_val) * torch.sigmoid(h_gate) + h_stack_out = self.conv_horiz_1x1(h_stack_feat) + h_stack_out = h_stack_out + h_stack + + return v_stack_out, h_stack_out + + +# %% [markdown] +# ### Building the model +# +# Using the gated convolutions, we can now build our PixelCNN model. +# The architecture consists of multiple stacked GatedMaskedConv blocks, where we add an additional dilation factor to a few convolutions. +# This is used to increase the receptive field of the model and allows to take a larger context into accout during generation. +# As a reminder, dilation on a convolution works looks as follows +# (figure credit - [Vincent Dumoulin and Francesco Visin](https://arxiv.org/pdf/1603.07285.pdf)): +# +#
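+# As a quick sanity check (a small sketch, not part of the original tutorial code), we can confirm that a
+# $3\times 3$ convolution with `dilation=2` covers a $5\times 5$ neighbourhood while keeping only nine weights
+# per channel pair, and that choosing the padding equal to the dilation preserves the spatial size:
+
+# %%
+dilated_conv = nn.Conv2d(1, 1, kernel_size=3, dilation=2, padding=2)
+print(dilated_conv(torch.zeros(1, 1, 11, 11)).shape)  # -> torch.Size([1, 1, 11, 11])
+del dilated_conv
+
+# %% [markdown]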
+# +# Note that the smaller output size is only because the animation assumes no padding. +# In our implementation, we will pad the input image correspondingly. +# Alternatively to dilated convolutions, we could downsample the input and use a encoder-decoder architecture as in PixelCNN++ [3]. +# This is especially beneficial if we want to build a very deep autoregressive model. +# Nonetheless, as we seek to train a reasonably small model, dilated convolutions are the more efficient option to use here. +# +# Below, we implement the PixelCNN model as a PyTorch Lightning module. +# Besides the stack of gated convolutions, we also have the initial +# horizontal and vertical convolutions which mask the center pixel, and a +# final $1\times 1$ convolution which maps the output features to class +# predictions. To determine the likelihood of a batch of images, we first +# create our initial features using the masked horizontal and vertical +# input convolution. Next, we forward the features through the stack of +# gated convolutions. Finally, we take the output features of the +# horizontal stack, and apply the $1\times 1$ convolution for +# classification. We use the bits per dimension metric for the likelihood, +# similarly to Tutorial 11 and assignment 3. + + +# %% +class PixelCNN(L.LightningModule): + def __init__(self, c_in, c_hidden): + super().__init__() + self.save_hyperparameters() + + # Initial convolutions skipping the center pixel + self.conv_vstack = VerticalStackConvolution(c_in, c_hidden, mask_center=True) + self.conv_hstack = HorizontalStackConvolution(c_in, c_hidden, mask_center=True) + # Convolution block of PixelCNN. We use dilation instead of downscaling + self.conv_layers = nn.ModuleList( + [ + GatedMaskedConv(c_hidden), + GatedMaskedConv(c_hidden, dilation=2), + GatedMaskedConv(c_hidden), + GatedMaskedConv(c_hidden, dilation=4), + GatedMaskedConv(c_hidden), + GatedMaskedConv(c_hidden, dilation=2), + GatedMaskedConv(c_hidden), + ] + ) + # Output classification convolution (1x1) + self.conv_out = nn.Conv2d(c_hidden, c_in * 256, kernel_size=1, padding=0) + + self.example_input_array = train_set[0][0][None] + + def forward(self, x): + """Forward image through model and return logits for each pixel. + + Args: + x: Image tensor with integer values between 0 and 255. + """ + # Scale input from 0 to 255 back to -1 to 1 + x = (x.float() / 255.0) * 2 - 1 + + # Initial convolutions + v_stack = self.conv_vstack(x) + h_stack = self.conv_hstack(x) + # Gated Convolutions + for layer in self.conv_layers: + v_stack, h_stack = layer(v_stack, h_stack) + # 1x1 classification convolution + # Apply ELU before 1x1 convolution for non-linearity on residual connection + out = self.conv_out(F.elu(h_stack)) + + # Output dimensions: [Batch, Classes, Channels, Height, Width] + out = out.reshape(out.shape[0], 256, out.shape[1] // 256, out.shape[2], out.shape[3]) + return out + + def calc_likelihood(self, x): + # Forward pass with bpd likelihood calculation + pred = self.forward(x) + nll = F.cross_entropy(pred, x, reduction="none") + bpd = nll.mean(dim=[1, 2, 3]) * np.log2(np.exp(1)) + return bpd.mean() + + @torch.no_grad() + def sample(self, img_shape, img=None): + """Sampling function for the autoregressive model. + + Args: + img_shape: Shape of the image to generate (B,C,H,W) + img (optional): If given, this tensor will be used as + a starting image. The pixels to fill + should be -1 in the input tensor. 
+ """ + # Create empty image + if img is None: + img = torch.zeros(img_shape, dtype=torch.long).to(device) - 1 + # Generation loop + for h in tqdm(range(img_shape[2]), leave=False): + for w in range(img_shape[3]): + for c in range(img_shape[1]): + # Skip if not to be filled (-1) + if (img[:, c, h, w] != -1).all().item(): + continue + # For efficiency, we only have to input the upper part of the image + # as all other parts will be skipped by the masked convolutions anyways + pred = self.forward(img[:, :, : h + 1, :]) + probs = F.softmax(pred[:, :, c, h, w], dim=-1) + img[:, c, h, w] = torch.multinomial(probs, num_samples=1).squeeze(dim=-1) + return img + + def configure_optimizers(self): + optimizer = optim.Adam(self.parameters(), lr=1e-3) + scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.99) + return [optimizer], [scheduler] + + def training_step(self, batch, batch_idx): + loss = self.calc_likelihood(batch[0]) + self.log("train_bpd", loss) + return loss + + def validation_step(self, batch, batch_idx): + loss = self.calc_likelihood(batch[0]) + self.log("val_bpd", loss) + + def test_step(self, batch, batch_idx): + loss = self.calc_likelihood(batch[0]) + self.log("test_bpd", loss) + + +# %% [markdown] +# To sample from the autoregressive model, we need to iterate over all dimensions of the input. +# We start with an empty image, and fill the pixels one by one, starting from the upper left corner. +# Note that as for predicting $x_i$, all pixels below it have no influence on the prediction. +# Hence, we can cut the image in height without changing the prediction while increasing efficiency. +# Nevertheless, all the loops in the sampling function already show that it will take us quite some time to sample. +# A lot of computation could be reused across loop iterations as those the features on the already predicted pixels will not change over iterations. +# Nevertheless, this takes quite some effort to implement, and is often not done in implementations because in the end, +# autoregressive sampling remains sequential and slow. +# Hence, we settle with the default implementation here. +# +# Before training the model, we can check the full receptive field of the model on an MNIST image of size $28\times 28$: + +# %% +test_model = PixelCNN(c_in=1, c_hidden=64) +inp = torch.zeros(1, 1, 28, 28) +inp.requires_grad_() +out = test_model(inp) +show_center_recep_field(inp, out.squeeze(dim=2)) +del inp, out, test_model + +# %% [markdown] +# The visualization shows that for predicting any pixel, we can take almost half of the image into account. +# However, keep in mind that this is the "theoretical" receptive field and not necessarily +# the [effective receptive field](https://arxiv.org/pdf/1701.04128.pdf), which is usually much smaller. +# For a stronger model, we should therefore try to increase the receptive +# field even further. Especially, for the pixel on the bottom right, the +# very last pixel, we would be allowed to take into account the whole +# image. However, our current receptive field only spans across 1/4 of the +# image. An encoder-decoder architecture can help with this, but it also +# shows that we require a much deeper, more complex network in +# autoregressive models than in VAEs or energy-based models. + +# %% [markdown] +# ### Training loop +# +# To train the model, we again can rely on PyTorch Lightning and write a +# function below for loading the pretrained model if it exists. 
To reduce +# the computational cost, we have saved the validation and test score in +# the checkpoint already: + + +# %% +def train_model(**kwargs): + # Create a PyTorch Lightning trainer with the generation callback + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, "PixelCNN"), + accelerator="auto", + devices=1, + max_epochs=150, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_bpd"), + LearningRateMonitor("epoch"), + ], + ) + result = None + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "PixelCNN.ckpt") + if os.path.isfile(pretrained_filename): + print("Found pretrained model, loading...") + model = PixelCNN.load_from_checkpoint(pretrained_filename) + ckpt = torch.load(pretrained_filename, map_location=device) + result = ckpt.get("result", None) + else: + model = PixelCNN(**kwargs) + trainer.fit(model, train_loader, val_loader) + model = model.to(device) + + if result is None: + # Test best model on validation and test set + val_result = trainer.test(model, dataloaders=val_loader, verbose=False) + test_result = trainer.test(model, dataloaders=test_loader, verbose=False) + result = {"test": test_result, "val": val_result} + return model, result + + +# %% [markdown] +# Training the model is time consuming and we recommend using the provided pre-trained model for going through this notebook. +# However, feel free to play around with the hyperparameter like number of layers etc. +# if you want to get a feeling for those. +# +# When calling the training function with a pre-trained model, we automatically load it and print its test performance: + +# %% +model, result = train_model(c_in=1, c_hidden=64) +test_res = result["test"][0] +print( + "Test bits per dimension: %4.3fbpd" % (test_res["test_loss"] if "test_loss" in test_res else test_res["test_bpd"]) +) + +# %% [markdown] +# With a test performance of 0.809bpd, the PixelCNN significantly outperforms the normalizing flows we have seen in Tutorial 11. +# Considering image modeling as an autoregressive problem simplifies the learning process as predicting +# one pixel given the ground truth of all others is much easier than predicting all pixels at once. +# In addition, PixelCNN can explicitly predict the pixel values by a discrete softmax while +# Normalizing Flows have to learn transformations in continuous latent space. +# These two aspects allow the PixelCNN to achieve a notably better performance. +# +# To fully compare the models, let's also measure the number of parameters of the PixelCNN: + +# %% +num_params = sum(np.prod(param.shape) for param in model.parameters()) +print(f"Number of parameters: {num_params:,}") + +# %% [markdown] +# Compared to the multi-scale normalizing flows, the PixelCNN has considerably less parameters. +# Of course, the number of parameters depend on our hyperparameter choices. +# Nevertheless, in general, it can be said that autoregressive models +# require considerably less parameters than normalizing flows to reach +# good performance, based on the reasons stated above. Still, +# autoregressive models are much slower in sampling than normalizing +# flows, which limits their possible applications. + +# %% [markdown] +# ## Sampling +# +# One way of qualitatively analysing generative models is by looking at the actual samples. 
+# Let's therefore use our sampling function to generate a few digits: + +# %% +L.seed_everything(1) +samples = model.sample(img_shape=(16, 1, 28, 28)) +show_imgs(samples.cpu()) + +# %% [markdown] +# Most of the samples can be identified as digits, and overall we achieve a better quality than we had in normalizing flows. +# This goes along with the lower likelihood we achieved with autoregressive models. +# Nevertheless, we also see that there is still place for improvement +# as a considerable amount of samples cannot be identified (for example the first row). +# Deeper autoregressive models are expected to achieve better quality, +# as they can take more context into account for generating the pixels. +# +# Note that on Google Colab, you might see different results, specifically with a white line at the top. +# After some debugging, it seemed that the difference occurs inside the dilated convolution, +# as it gives different results for different batch sizes. +# However, it is hard to debug this further as it might be a bug of the installed PyTorch version on Google Colab. +# +# The trained model itself is not restricted to any specific image size. +# However, what happens if we actually sample a larger image than we had +# seen in our training dataset? Let's try below to sample images of size +# $64\times64$ instead of $28\times28$: + +# %% +L.seed_everything(1) +samples = model.sample(img_shape=(8, 1, 64, 64)) +show_imgs(samples.cpu()) + +# %% [markdown] +# The larger images show that changing the size of the image during testing confuses the model +# and generates abstract figures (you can sometimes spot a digit in the upper left corner). +# In addition, sampling for images of 64x64 pixels take more than a minute on a GPU. +# Clearly, autoregressive models cannot be scaled to large images without changing the sampling procedure such as with [forecasting](https://arxiv.org/abs/2002.09928). +# Our implementation is also not the most efficient as many computations can be stored and reused throughout the sampling process. +# Nevertheless, the sampling procedure stays sequential which is +# inherently slower than parallel generation like done in normalizing +# flows. + +# %% [markdown] +# ### Autocompletion +# +# One common application done with autoregressive models is +# auto-completing an image. As autoregressive models predict pixels one by +# one, we can set the first $N$ pixels to predefined values and check how +# the model completes the image. For implementing this, we just need to +# skip the iterations in the sampling loop that already have a value +# unequals -1. See above in our PyTorch Lightning module for the specific +# implementation. In the cell below, we randomly take three images from +# the training set, mask about the lower half of the image, and let the +# model autocomplete it. 
To see the diversity of samples, we do this 12 +# times for each image: + + +# %% +def autocomplete_image(img): + # Remove lower half of the image + img_init = img.clone() + img_init[:, 10:, :] = -1 + print("Original image and input image to sampling:") + show_imgs([img, img_init]) + # Generate 12 example completions + img_init = img_init.unsqueeze(dim=0).expand(12, -1, -1, -1).to(device) + L.seed_everything(1) + img_generated = model.sample(img_init.shape, img_init) + print("Autocompletion samples:") + show_imgs(img_generated) + + +for i in range(1, 4): + img = train_set[i][0] + autocomplete_image(img) + +# %% [markdown] +# For the first two digits (7 and 6), we see that the 12 samples all +# result in a shape which resemble the original digit. Nevertheless, there +# are some style difference in writing the 7, and some deformed sixes in +# the samples. When autocompleting the 9 below, we see that the model can +# fit multiple digits to it. We obtain diverse samples from 0, 3, 8 and 9. +# This shows that despite having no latent space, we can still obtain +# diverse samples from an autoregressive model. + +# %% [markdown] +# ### Visualization of the predictive distribution (softmax) +# +# Autoregressive models use a softmax over 256 values to predict the next pixel. +# This gives the model a large flexibility as the probabilities for each pixel value can be learned independently if necessary. +# However, the values are actually not independent because the values 32 and 33 are much closer than 32 and 255. +# In the following, we visualize the softmax distribution that the model predicts to gain insights how it has learned the relationships of close-by pixels. +# +# To do this, we first run the model on a batch of images and store the output softmax distributions: + +# %% +det_loader = data.DataLoader(train_set, batch_size=128, shuffle=False, drop_last=False) +imgs, _ = next(iter(det_loader)) +imgs = imgs.to(device) +with torch.no_grad(): + out = model(imgs) + out = F.softmax(out, dim=1) + mean_out = out.mean(dim=[0, 2, 3, 4]).cpu().numpy() + out = out.cpu().numpy() + +# %% [markdown] +# Before diving into the model, let's visualize the distribution of the pixel values in the whole dataset: + +# %% +sns.set() +plot_args = {"color": to_rgb("C0") + (0.5,), "edgecolor": "C0", "linewidth": 0.5, "width": 1.0} +plt.hist(imgs.view(-1).cpu().numpy(), bins=256, density=True, **plot_args) +plt.yscale("log") +plt.xticks([0, 64, 128, 192, 256]) +plt.show() +plt.close() + +# %% [markdown] +# As we would expect from the seen images, the pixel value 0 (black) is the dominant value, followed by a batch of values between 250 and 255. +# Note that we use a log scale on the y-axis due to the big imbalance in the dataset. +# Interestingly, the pixel values 64, 128 and 191 also stand out which is likely due to the quantization used during the creation of the dataset. +# For RGB images, we would also see two peaks around 0 and 255, +# but the values in between would be much more frequent than in MNIST +# (see Figure 1 in the [PixelCNN++](https://arxiv.org/pdf/1701.05517.pdf) for a visualization on CIFAR10). +# +# Next, we can visualize the distribution our model predicts (in average): + +# %% +plt.bar(np.arange(mean_out.shape[0]), mean_out, **plot_args) +plt.yscale("log") +plt.xticks([0, 64, 128, 192, 256]) +plt.show() +plt.close() + +# %% [markdown] +# This distribution is very close to the actual dataset distribution. +# This is in general a good sign, but we can see a slightly smoother histogram than above. 
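+#
+# One way to make "very close" a bit more concrete (an optional sketch, not part of the original tutorial) is to
+# compute the total variation distance between the empirical pixel histogram of this batch and the model's average
+# predictive distribution: 0 would mean identical histograms, 1 maximally different ones.
+
+# %%
+data_hist = np.bincount(imgs.view(-1).cpu().numpy(), minlength=256).astype(np.float32)
+data_hist /= data_hist.sum()
+tv_dist = 0.5 * np.abs(mean_out - data_hist).sum()
+print(f"Total variation distance between data and average model distribution: {tv_dist:.4f}")
+
+# %% [markdown]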
+# +# Finally, to take a closer look at learned value relations, we can +# visualize the distribution for individual pixel predictions to get a +# better intuition. For this, we pick 4 random images and pixels, and +# visualize their distribution below: + +# %% +fig, ax = plt.subplots(2, 2, figsize=(10, 6)) +for i in range(4): + ax_sub = ax[i // 2][i % 2] + ax_sub.bar(np.arange(out.shape[1], dtype=np.int32), out[i + 4, :, 0, 14, 14], **plot_args) + ax_sub.set_yscale("log") + ax_sub.set_xticks([0, 64, 128, 192, 256]) +plt.show() +plt.close() + +# %% [markdown] +# Overall we see a very diverse set of distributions, with a usual peak +# for 0 and close to 1. However, the distributions in the first row show a +# potentially undesirable behavior. For instance, the value 242 has a +# 1000x lower likelihood than 243 although they are extremely close and +# can often not be distinguished. This shows that the model might have not +# generlized well over pixel values. The better solution to this problem +# is to use discrete logitics mixtures instead of a softmax distribution. +# A discrete logistic distribution can be imagined as discretized, binned +# Gaussians. Using a mixture of discrete logistics instead of a softmax +# introduces an inductive bias to the model to assign close-by values +# similar likelihoods. We can visualize a discrete logistic below: + +# %% +mu = Tensor([128]) +sigma = Tensor([2.0]) + + +def discrete_logistic(x, mu, sigma): + return torch.sigmoid((x + 0.5 - mu) / sigma) - torch.sigmoid((x - 0.5 - mu) / sigma) + + +x = torch.arange(256) +p = discrete_logistic(x, mu, sigma) + +# Visualization +plt.figure(figsize=(6, 3)) +plt.bar(x.numpy(), p.numpy(), **plot_args) +plt.xlim(96, 160) +plt.title("Discrete logistic distribution") +plt.xlabel("Pixel value") +plt.ylabel("Probability") +plt.show() +plt.close() + +# %% [markdown] +# Instead of the softmax, the model would output mean and standard +# deviations for the $K$ logistics we use in the mixture. This is one of +# the improvements in autoregressive models that PixelCNN++ [3] has +# introduced compared to the original PixelCNN. + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we have looked at autoregressive image modeling, and +# implemented the PixelCNN architecture. With the usage of masked +# convolutions, we are able to apply a convolutional network in which a +# pixel is only influenced by all its predecessors. Separating the masked +# convolution into a horizontal and vertical stack allowed us to remove +# the known blind spot on the right upper row of a pixel. In experiments, +# autoregressive models outperformed normalizing flows in terms of bits +# per dimension, but are much slower to sample from. Improvements, that we +# have not implemented ourselves here, are discrete logistic mixtures, a +# downsampling architecture, and changing the pixel order in a diagonal +# fashion (see PixelSNAIL). Overall, autoregressive models are another, +# strong family of generative models, which however are mostly used in +# sequence tasks because of their linear scaling in sampling time than +# quadratic as on images. + +# %% [markdown] +# ## References +# [1] van den Oord, A., et al. +# "Pixel Recurrent Neural Networks." +# arXiv preprint arXiv:1601.06759 (2016). +# [Link](https://arxiv.org/abs/1601.06759) +# +# [2] van den Oord, A., et al. +# "Conditional Image Generation with PixelCNN Decoders." +# In Advances in Neural Information Processing Systems 29, pp. +# 4790–4798 (2016). 
+# [Link](http://papers.nips.cc/paper/6527-conditional-image-generation-with-pixelcnn-decoders.pdf) +# +# [3] Salimans, Tim, et al. +# "PixelCNN++: Improving the PixelCNN with Discretized Logistic Mixture Likelihood and Other Modifications." +# arXiv preprint arXiv:1701.05517 (2017). +# [Link](https://arxiv.org/abs/1701.05517) diff --git a/course_UvA-DL/10-autoregressive-image-modeling/PixelCNN_GatedConv.svg b/course_UvA-DL/10-autoregressive-image-modeling/PixelCNN_GatedConv.svg new file mode 100644 index 0000000..2104663 --- /dev/null +++ b/course_UvA-DL/10-autoregressive-image-modeling/PixelCNN_GatedConv.svg @@ -0,0 +1,2195 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/10-autoregressive-image-modeling/autoregressive_image_modeling.svg b/course_UvA-DL/10-autoregressive-image-modeling/autoregressive_image_modeling.svg new file mode 100644 index 0000000..8584a8b --- /dev/null +++ b/course_UvA-DL/10-autoregressive-image-modeling/autoregressive_image_modeling.svg @@ -0,0 +1,961 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/10-autoregressive-image-modeling/masked_convolution.svg b/course_UvA-DL/10-autoregressive-image-modeling/masked_convolution.svg new file mode 100644 index 0000000..925dc48 --- /dev/null +++ b/course_UvA-DL/10-autoregressive-image-modeling/masked_convolution.svg @@ -0,0 +1,923 @@ + + + + + + image/svg+xml + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/10-autoregressive-image-modeling/pixelcnn_blind_spot.svg b/course_UvA-DL/10-autoregressive-image-modeling/pixelcnn_blind_spot.svg new file mode 100644 index 0000000..18ade2f --- /dev/null +++ b/course_UvA-DL/10-autoregressive-image-modeling/pixelcnn_blind_spot.svg @@ -0,0 +1,1091 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/11-vision-transformer/.meta.yml b/course_UvA-DL/11-vision-transformer/.meta.yml new file mode 100644 index 0000000..171d877 --- /dev/null +++ b/course_UvA-DL/11-vision-transformer/.meta.yml @@ -0,0 +1,23 @@ +title: "Tutorial 11: Vision Transformers" +author: Phillip Lippe +created: 2021-08-21 +updated: 2023-03-14 +license: CC BY-SA +description: | + In this tutorial, we will take a closer look at a recent new trend: Transformers for Computer Vision. + Since [Alexey Dosovitskiy et al.](https://openreview.net/pdf?id=YicbFdNTTy) successfully applied a Transformer on a variety of image recognition benchmarks, there have been an incredible amount of follow-up works showing that CNNs might not be optimal architecture for Computer Vision anymore. + But how do Vision Transformers work exactly, and what benefits and drawbacks do they offer in contrast to CNNs? + We will answer these questions by implementing a Vision Transformer ourselves, and train it on the popular, small dataset CIFAR10. + We will compare these results to popular convolutional architectures such as Inception, ResNet and DenseNet. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +tags: + - Image +requirements: + - torchvision + - matplotlib + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/course_UvA-DL/11-vision-transformer/.thumb.jpg b/course_UvA-DL/11-vision-transformer/.thumb.jpg new file mode 100644 index 0000000..c129c4b Binary files /dev/null and b/course_UvA-DL/11-vision-transformer/.thumb.jpg differ diff --git a/course_UvA-DL/11-vision-transformer/Vision_Transformer.py b/course_UvA-DL/11-vision-transformer/Vision_Transformer.py new file mode 100644 index 0000000..a7c419a --- /dev/null +++ b/course_UvA-DL/11-vision-transformer/Vision_Transformer.py @@ -0,0 +1,532 @@ +# %% [markdown] +#
+# Let's start with importing our standard set of libraries. + +# %% +import os +import urllib.request +from urllib.error import HTTPError + +import lightning as L +import matplotlib +import matplotlib.pyplot as plt +import matplotlib_inline.backend_inline +import seaborn as sns +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data +import torchvision +from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint +from torchvision import transforms +from torchvision.datasets import CIFAR10 + +plt.set_cmap("cividis") +# %matplotlib inline +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +matplotlib.rcParams["lines.linewidth"] = 2.0 +sns.reset_orig() + +# %load_ext tensorboard + +# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/VisionTransformers/") + +# Setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") +print("Device:", device) + +# %% [markdown] +# We provide a pre-trained Vision Transformer which we download in the next cell. +# However, Vision Transformers can be relatively quickly trained on CIFAR10 with an overall training time of less than an hour on an NVIDIA TitanRTX. +# Feel free to experiment with training your own Transformer once you went through the whole notebook. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/" +# Files to download +pretrained_files = [ + "tutorial15/ViT.ckpt", + "tutorial15/tensorboards/ViT/events.out.tfevents.ViT", + "tutorial5/tensorboards/ResNet/events.out.tfevents.resnet", +] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name.split("/", 1)[1]) + if "/" in file_name.split("/", 1)[1]: + os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print("Downloading %s..." % file_url) + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# We load the CIFAR10 dataset below. +# We use the same setup of the datasets and data augmentations as for the CNNs in Tutorial 5 to keep a fair comparison. +# The constants in the `transforms.Normalize` correspond to the values +# that scale and shift the data to a zero mean and standard deviation of +# one. + +# %% +test_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784]), + ] +) +# For training, we add some augmentation. Networks are too powerful and would overfit. 
+train_transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)), + transforms.ToTensor(), + transforms.Normalize([0.49139968, 0.48215841, 0.44653091], [0.24703223, 0.24348513, 0.26158784]), + ] +) +# Loading the training dataset. We need to split it into a training and validation part +# We need to do a little trick because the validation set should not use the augmentation. +train_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=train_transform, download=True) +val_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=test_transform, download=True) +L.seed_everything(42) +train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000]) +L.seed_everything(42) +_, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000]) + +# Loading the test set +test_set = CIFAR10(root=DATASET_PATH, train=False, transform=test_transform, download=True) + +# We define a set of data loaders that we can use for various purposes later. +train_loader = data.DataLoader(train_set, batch_size=128, shuffle=True, drop_last=True, pin_memory=True, num_workers=4) +val_loader = data.DataLoader(val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) +test_loader = data.DataLoader(test_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4) + +# Visualize some examples +NUM_IMAGES = 4 +CIFAR_images = torch.stack([val_set[idx][0] for idx in range(NUM_IMAGES)], dim=0) +img_grid = torchvision.utils.make_grid(CIFAR_images, nrow=4, normalize=True, pad_value=0.9) +img_grid = img_grid.permute(1, 2, 0) + +plt.figure(figsize=(8, 8)) +plt.title("Image examples of the CIFAR10 dataset") +plt.imshow(img_grid) +plt.axis("off") +plt.show() +plt.close() + +# %% [markdown] +# ## Transformers for image classification +# +# Transformers have been originally proposed to process sets since it is a permutation-equivariant architecture, i.e., producing the same output permuted if the input is permuted. +# To apply Transformers to sequences, we have simply added a positional encoding to the input feature vectors, and the model learned by itself what to do with it. +# So, why not do the same thing on images? +# This is exactly what [Alexey Dosovitskiy et al. ](https://openreview.net/pdf?id=YicbFdNTTy) proposed in their paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". +# Specifically, the Vision Transformer is a model for image classification that views images as sequences of smaller patches. +# As a preprocessing step, we split an image of, for example, $48\times 48$ pixels into 9 $16\times 16$ patches. +# Each of those patches is considered to be a "word"/"token", and projected to a feature space. +# With adding positional encodings and a token for classification on top, we can apply a Transformer as usual to this sequence and start training it for our task. +# A nice GIF visualization of the architecture is shown below (figure credit - [Phil Wang](https://github.com/lucidrains/vit-pytorch/blob/main/images/vit.gif)): +# +#
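+
+# %% [markdown]
+# As a quick sanity check of the patch arithmetic mentioned above:
+
+# %%
+# Small arithmetic check (illustration only): the 48x48 example above gives 9 patches of 16x16,
+# and the 32x32 CIFAR images we use later give 64 patches of 4x4.
+for img_size, patch_size in ((48, 16), (32, 4)):
+    print(f"{img_size}x{img_size} image, {patch_size}x{patch_size} patches -> {(img_size // patch_size) ** 2} patches")
+
+# %% [markdown]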
+# +# We will walk step by step through the Vision Transformer, and implement all parts by ourselves. +# First, let's implement the image preprocessing: an image of size $N\times N$ has to be split into $(N/M)^2$ patches of size $M\times M$. +# These represent the input words to the Transformer. + + +# %% +def img_to_patch(x, patch_size, flatten_channels=True): + """ + Args: + x: Tensor representing the image of shape [B, C, H, W] + patch_size: Number of pixels per dimension of the patches (integer) + flatten_channels: If True, the patches will be returned in a flattened format + as a feature vector instead of a image grid. + """ + B, C, H, W = x.shape + x = x.reshape(B, C, H // patch_size, patch_size, W // patch_size, patch_size) + x = x.permute(0, 2, 4, 1, 3, 5) # [B, H', W', C, p_H, p_W] + x = x.flatten(1, 2) # [B, H'*W', C, p_H, p_W] + if flatten_channels: + x = x.flatten(2, 4) # [B, H'*W', C*p_H*p_W] + return x + + +# %% [markdown] +# Let's take a look at how that works for our CIFAR examples above. +# For our images of size $32\times 32$, we choose a patch size of 4. +# Hence, we obtain sequences of 64 patches of size $4\times 4$. +# We visualize them below: + +# %% +img_patches = img_to_patch(CIFAR_images, patch_size=4, flatten_channels=False) + +fig, ax = plt.subplots(CIFAR_images.shape[0], 1, figsize=(14, 3)) +fig.suptitle("Images as input sequences of patches") +for i in range(CIFAR_images.shape[0]): + img_grid = torchvision.utils.make_grid(img_patches[i], nrow=64, normalize=True, pad_value=0.9) + img_grid = img_grid.permute(1, 2, 0) + ax[i].imshow(img_grid) + ax[i].axis("off") +plt.show() +plt.close() + +# %% [markdown] +# Compared to the original images, it is much harder to recognize the objects from those patch lists now. +# Still, this is the input we provide to the Transformer for classifying the images. +# The model has to learn itself how it has to combine the patches to recognize the objects. +# The inductive bias in CNNs that an image is grid of pixels, is lost in this input format. +# +# After we have looked at the preprocessing, we can now start building the Transformer model. +# Since we have discussed the fundamentals of Multi-Head Attention in [Tutorial 6](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html), we will use the PyTorch module `nn.MultiheadAttention` ([docs](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html?highlight=multihead#torch.nn.MultiheadAttention)) here. +# Further, we use the Pre-Layer Normalization version of the Transformer blocks proposed by [Ruibin Xiong et al. ](http://proceedings.mlr.press/v119/xiong20b/xiong20b.pdf) in 2020. +# The idea is to apply Layer Normalization not in between residual blocks, but instead as a first layer in the residual blocks. +# This reorganization of the layers supports better gradient flow and removes the necessity of a warm-up stage. +# A visualization of the difference between the standard Post-LN and the Pre-LN version is shown below. +# +#
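+
+# %% [markdown]
+# To make this reordering concrete, the following is a minimal sketch of the attention sub-block under
+# both orderings, using toy tensors and the modules imported above (the full block, implemented next,
+# also contains the feed-forward part):
+
+# %%
+# Minimal sketch of the two orderings (toy tensors, not used by the model below).
+_norm = nn.LayerNorm(64)
+_attn = nn.MultiheadAttention(embed_dim=64, num_heads=4)
+_x = torch.randn(16, 8, 64)  # [SeqLen, Batch, Dims]
+# Post-LN (original Transformer): normalize *after* adding the residual
+_post_ln = _norm(_x + _attn(_x, _x, _x)[0])
+# Pre-LN (used in this notebook): normalize first, then attend and add the residual
+_pre_ln = _x + _attn(_norm(_x), _norm(_x), _norm(_x))[0]
+print(_post_ln.shape, _pre_ln.shape)
+
+# %% [markdown]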
+# +# The implementation of the Pre-LN attention block looks as follows: + + +# %% +class AttentionBlock(nn.Module): + def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0): + """Attention Block. + + Args: + embed_dim: Dimensionality of input and attention feature vectors + hidden_dim: Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads: Number of heads to use in the Multi-Head Attention block + dropout: Amount of dropout to apply in the feed-forward network + """ + super().__init__() + + self.layer_norm_1 = nn.LayerNorm(embed_dim) + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.layer_norm_2 = nn.LayerNorm(embed_dim) + self.linear = nn.Sequential( + nn.Linear(embed_dim, hidden_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(hidden_dim, embed_dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + inp_x = self.layer_norm_1(x) + x = x + self.attn(inp_x, inp_x, inp_x)[0] + x = x + self.linear(self.layer_norm_2(x)) + return x + + +# %% [markdown] +# Now we have all modules ready to build our own Vision Transformer. +# Besides the Transformer encoder, we need the following modules: +# +# * A **linear projection** layer that maps the input patches to a feature vector of larger size. +# It is implemented by a simple linear layer that takes each $M\times M$ patch independently as input. +# * A **classification token** that is added to the input sequence. +# We will use the output feature vector of the classification token (CLS token in short) for determining the classification prediction. +# * Learnable **positional encodings** that are added to the tokens before being processed by the Transformer. +# Those are needed to learn position-dependent information, and convert the set to a sequence. +# Since we usually work with a fixed resolution, we can learn the positional encodings instead of having the pattern of sine and cosine functions. +# * A **MLP head** that takes the output feature vector of the CLS token, and maps it to a classification prediction. +# This is usually implemented by a small feed-forward network or even a single linear layer. +# +# With those components in mind, let's implement the full Vision Transformer below: + + +# %% +class VisionTransformer(nn.Module): + def __init__( + self, + embed_dim, + hidden_dim, + num_channels, + num_heads, + num_layers, + num_classes, + patch_size, + num_patches, + dropout=0.0, + ): + """Vision Transformer. 
+ + Args: + embed_dim: Dimensionality of the input feature vectors to the Transformer + hidden_dim: Dimensionality of the hidden layer in the feed-forward networks + within the Transformer + num_channels: Number of channels of the input (3 for RGB) + num_heads: Number of heads to use in the Multi-Head Attention block + num_layers: Number of layers to use in the Transformer + num_classes: Number of classes to predict + patch_size: Number of pixels that the patches have per dimension + num_patches: Maximum number of patches an image can have + dropout: Amount of dropout to apply in the feed-forward network and + on the input encoding + """ + super().__init__() + + self.patch_size = patch_size + + # Layers/Networks + self.input_layer = nn.Linear(num_channels * (patch_size**2), embed_dim) + self.transformer = nn.Sequential( + *(AttentionBlock(embed_dim, hidden_dim, num_heads, dropout=dropout) for _ in range(num_layers)) + ) + self.mlp_head = nn.Sequential(nn.LayerNorm(embed_dim), nn.Linear(embed_dim, num_classes)) + self.dropout = nn.Dropout(dropout) + + # Parameters/Embeddings + self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim)) + self.pos_embedding = nn.Parameter(torch.randn(1, 1 + num_patches, embed_dim)) + + def forward(self, x): + # Preprocess input + x = img_to_patch(x, self.patch_size) + B, T, _ = x.shape + x = self.input_layer(x) + + # Add CLS token and positional encoding + cls_token = self.cls_token.repeat(B, 1, 1) + x = torch.cat([cls_token, x], dim=1) + x = x + self.pos_embedding[:, : T + 1] + + # Apply Transforrmer + x = self.dropout(x) + x = x.transpose(0, 1) + x = self.transformer(x) + + # Perform classification prediction + cls = x[0] + out = self.mlp_head(cls) + return out + + +# %% [markdown] +# Finally, we can put everything into a PyTorch Lightning Module as usual. +# We use `torch.optim.AdamW` as the optimizer, which is Adam with a corrected weight decay implementation. +# Since we use the Pre-LN Transformer version, we do not need to use a learning rate warmup stage anymore. +# Instead, we use the same learning rate scheduler as the CNNs in our previous tutorial on image classification. + + +# %% +class ViT(L.LightningModule): + def __init__(self, model_kwargs, lr): + super().__init__() + self.save_hyperparameters() + self.model = VisionTransformer(**model_kwargs) + self.example_input_array = next(iter(train_loader))[0] + + def forward(self, x): + return self.model(x) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr) + lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1) + return [optimizer], [lr_scheduler] + + def _calculate_loss(self, batch, mode="train"): + imgs, labels = batch + preds = self.model(imgs) + loss = F.cross_entropy(preds, labels) + acc = (preds.argmax(dim=-1) == labels).float().mean() + + self.log("%s_loss" % mode, loss) + self.log("%s_acc" % mode, acc) + return loss + + def training_step(self, batch, batch_idx): + loss = self._calculate_loss(batch, mode="train") + return loss + + def validation_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="val") + + def test_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="test") + + +# %% [markdown] +# ## Experiments +# +# Commonly, Vision Transformers are applied to large-scale image classification benchmarks such as ImageNet to leverage their full potential. +# However, here we take a step back and ask: can Vision Transformer also succeed on classical, small benchmarks such as CIFAR10? 
+# To find this out, we train a Vision Transformer from scratch on the CIFAR10 dataset. +# Let's first create a training function for our PyTorch Lightning module +# which also loads the pre-trained model if you have downloaded it above. + + +# %% +def train_model(**kwargs): + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, "ViT"), + accelerator="auto", + devices=1, + max_epochs=180, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"), + LearningRateMonitor("epoch"), + ], + ) + trainer.logger._log_graph = True # If True, we plot the computation graph in tensorboard + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "ViT.ckpt") + if os.path.isfile(pretrained_filename): + print("Found pretrained model at %s, loading..." % pretrained_filename) + # Automatically loads the model with the saved hyperparameters + model = ViT.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything(42) # To be reproducable + model = ViT(**kwargs) + trainer.fit(model, train_loader, val_loader) + # Load best checkpoint after training + model = ViT.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + # Test best model on validation and test set + val_result = trainer.test(model, dataloaders=val_loader, verbose=False) + test_result = trainer.test(model, dataloaders=test_loader, verbose=False) + result = {"test": test_result[0]["test_acc"], "val": val_result[0]["test_acc"]} + + return model, result + + +# %% [markdown] +# Now, we can already start training our model. +# As seen in our implementation, we have couple of hyperparameter that we have to choose. +# When creating this notebook, we have performed a small grid search over hyperparameters and listed the best hyperparameters in the cell below. +# Nevertheless, it is worth to discuss the influence that each hyperparameter has, and what intuition we have for choosing its value. +# +# First, let's consider the patch size. +# The smaller we make the patches, the longer the input sequences to the Transformer become. +# While in general, this allows the Transformer to model more complex functions, it requires a longer computation time due to its quadratic memory usage in the attention layer. +# Furthermore, small patches can make the task more difficult since the Transformer has to learn which patches are close-by, and which are far away. +# We experimented with patch sizes of 2, 4 and 8 which gives us the input sequence lengths of 256, 64, and 16 respectively. +# We found 4 to result in the best performance, and hence pick it below. +# +# Next, the embedding and hidden dimensionality have a similar impact to a Transformer as to an MLP. +# The larger the sizes, the more complex the model becomes, and the longer it takes to train. +# In Transformer however, we have one more aspect to consider: the query-key sizes in the Multi-Head Attention layers. +# Each key has the feature dimensionality of `embed_dim/num_heads`. +# Considering that we have an input sequence length of 64, a minimum reasonable size for the key vectors is 16 or 32. +# Lower dimensionalities can restrain the possible attention maps too much. +# We observed that more than 8 heads are not necessary for the Transformer, and therefore pick a embedding dimensionality of `256`. 
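+
+# %% [markdown]
+# As a small sanity check, both the sequence lengths and the per-head feature size quoted above follow
+# directly from the $32\times 32$ image size and the chosen hyperparameters:
+
+# %%
+# Quick arithmetic check (illustration only, matching the values discussed above).
+for patch_size in (2, 4, 8):
+    print(f"patch size {patch_size} -> sequence length {(32 // patch_size) ** 2}")
+print("features per head:", 256 // 8)  # embed_dim / num_heads
+
+# %% [markdown]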
+# The hidden dimensionality in the feed-forward networks is usually 2-4x larger than the embedding dimensionality, and thus we pick `512`. +# +# Finally, the learning rate for Transformers is usually relatively small, and in papers, a common value to use is 3e-5. +# However, since we work with a smaller dataset and have a potentially easier task, we found that we are able to increase the learning rate to 3e-4 without any problems. +# To reduce overfitting, we use a dropout value of 0.2. +# Remember that we also use small image augmentations as regularization during training. +# +# Feel free to explore the hyperparameters yourself by changing the values below. +# In general, the Vision Transformer did not show to be too sensitive to +# the hyperparameter choices on the CIFAR10 dataset. + +# %% +model, results = train_model( + model_kwargs={ + "embed_dim": 256, + "hidden_dim": 512, + "num_heads": 8, + "num_layers": 6, + "patch_size": 4, + "num_channels": 3, + "num_patches": 64, + "num_classes": 10, + "dropout": 0.2, + }, + lr=3e-4, +) +print("ViT results", results) + +# %% [markdown] +# The Vision Transformer achieves a validation and test performance of about 75%. +# In comparison, almost all CNN architectures that we have tested in [Tutorial 5](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial5/Inception_ResNet_DenseNet.html) obtained a classification performance of around 90%. +# This is a considerable gap and shows that although Vision Transformers perform strongly on ImageNet with potential pretraining, they cannot come close to simple CNNs on CIFAR10 when being trained from scratch. +# The differences between a CNN and Transformer can be well observed in the training curves. +# Let's look at them in a tensorboard below: + +# %% +# Opens tensorboard in notebook. Adjust the path to your CHECKPOINT_PATH! +# %tensorboard --logdir ../saved_models/tutorial15/tensorboards/ + +# %% [markdown] +#
+ +# %% [markdown] +# The tensorboard compares the Vision Transformer to a ResNet trained on CIFAR10. +# When looking at the training losses, we see that the ResNet learns much more quickly in the first iterations. +# While the learning rate might have an influence on the initial learning speed, we see the same trend in the validation accuracy. +# The ResNet achieves the best performance of the Vision Transformer after just 5 epochs (2000 iterations). +# Further, while the ResNet training loss and validation accuracy have a similar trend, the validation performance of the Vision Transformers only marginally changes after 10k iterations while the training loss has almost just started going down. +# Yet, the Vision Transformer is also able to achieve a close-to 100% accuracy on the training set. +# +# All those observed phenomenons can be explained with a concept that we have visited before: inductive biases. +# Convolutional Neural Networks have been designed with the assumption that images are translation invariant. +# Hence, we apply convolutions with shared filters across the image. +# Furthermore, a CNN architecture integrates the concept of distance in an image: two pixels that are close to each other are more related than two distant pixels. +# Local patterns are combined into larger patterns, until we perform our classification prediction. +# All those aspects are inductive biases of a CNN. +# In contrast, a Vision Transformer does not know which two pixels are close to each other, and which are far apart. +# It has to learn this information solely from the sparse learning signal of the classification task. +# This is a huge disadvantage when we have a small dataset since such information is crucial for generalizing to an unseen test dataset. +# With large enough datasets and/or good pre-training, a Transformer can learn this information without the need of inductive biases, and instead is more flexible than a CNN. +# Especially long-distance relations between local patterns can be difficult to process in CNNs, while in Transformers, all patches have the distance of one. +# This is why Vision Transformers are so strong on large-scale datasets +# such as ImageNet, but underperform a lot when being applied to a small +# dataset such as CIFAR10. + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we have implemented our own Vision Transformer from scratch and applied it on the task of image classification. +# Vision Transformers work by splitting an image into a sequence of smaller patches, use those as input to a standard Transformer encoder. +# While Vision Transformers achieved outstanding results on large-scale image recognition benchmarks such as ImageNet, they considerably underperform when being trained from scratch on small-scale datasets like CIFAR10. +# The reason is that in contrast to CNNs, Transformers do not have the inductive biases of translation invariance and the feature hierachy (i.e. larger patterns consist of many smaller patterns). +# However, these aspects can be learned when enough data is provided, or the model has been pre-trained on other large-scale tasks. +# Considering that Vision Transformers have just been proposed end of 2020, there is likely a lot more to come on Transformers for Computer Vision. +# +# +# ### References +# +# Dosovitskiy, Alexey, et al. +# "An image is worth 16x16 words: Transformers for image recognition at scale." +# International Conference on Representation Learning (2021). 
+# [link](https://arxiv.org/pdf/2010.11929.pdf) +# +# Chen, Xiangning, et al. +# "When Vision Transformers Outperform ResNets without Pretraining or Strong Data Augmentations." +# arXiv preprint arXiv:2106.01548 (2021). +# [link](https://arxiv.org/abs/2106.01548) +# +# Tolstikhin, Ilya, et al. +# "MLP-mixer: An all-MLP Architecture for Vision." +# arXiv preprint arXiv:2105.01601 (2021). +# [link](https://arxiv.org/abs/2105.01601) +# +# Xiong, Ruibin, et al. +# "On layer normalization in the transformer architecture." +# International Conference on Machine Learning. +# PMLR, 2020. +# [link](http://proceedings.mlr.press/v119/xiong20b/xiong20b.pdf) diff --git a/course_UvA-DL/11-vision-transformer/pre_layer_norm.svg b/course_UvA-DL/11-vision-transformer/pre_layer_norm.svg new file mode 100644 index 0000000..cb7fbd4 --- /dev/null +++ b/course_UvA-DL/11-vision-transformer/pre_layer_norm.svg @@ -0,0 +1,810 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/11-vision-transformer/tensorboard_screenshot.png b/course_UvA-DL/11-vision-transformer/tensorboard_screenshot.png new file mode 100644 index 0000000..626e930 Binary files /dev/null and b/course_UvA-DL/11-vision-transformer/tensorboard_screenshot.png differ diff --git a/course_UvA-DL/11-vision-transformer/vit_architecture.png b/course_UvA-DL/11-vision-transformer/vit_architecture.png new file mode 100644 index 0000000..446aa44 Binary files /dev/null and b/course_UvA-DL/11-vision-transformer/vit_architecture.png differ diff --git a/course_UvA-DL/12-meta-learning/.meta.yml b/course_UvA-DL/12-meta-learning/.meta.yml new file mode 100644 index 0000000..18f459d --- /dev/null +++ b/course_UvA-DL/12-meta-learning/.meta.yml @@ -0,0 +1,29 @@ +title: "Tutorial 12: Meta-Learning - Learning to Learn" +author: Phillip Lippe +created: 2021-08-21 +updated: 2023-03-14 +license: CC BY-SA +tags: + - Few-shot-learning + - MAML + - ProtoNet +description: | + In this tutorial, we will discuss algorithms that learn models which can quickly adapt to new classes and/or tasks with few samples. + This area of machine learning is called _Meta-Learning_ aiming at "learning to learn". + Learning from very few examples is a natural task for humans. In contrast to current deep learning models, we need to see only a few examples of a police car or firetruck to recognize them in daily traffic. + This is crucial ability since in real-world application, it is rarely the case that the data stays static and does not change over time. + For example, an object detection system for mobile phones trained on data from 2000 will have troubles detecting today's common mobile phones, and thus, needs to adapt to new data without excessive label effort. + The optimization techniques we have discussed so far struggle with this because they only aim at obtaining a good performance on a test set that had similar data. + However, what if the test set has classes that we do not have in the training set? + Or what if we want to test the model on a completely different task? + We will discuss and implement three common Meta-Learning algorithms for such situations. 
+  This notebook is part of a lecture series on Deep Learning at the University of Amsterdam.
+  The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io.
+requirements:
+  - torchvision
+  - matplotlib
+  - seaborn
+  - lightning>=2.0.0rc0
+accelerator:
+  - CPU
+  - GPU
diff --git a/course_UvA-DL/12-meta-learning/.thumb.jpg b/course_UvA-DL/12-meta-learning/.thumb.jpg
new file mode 100644
index 0000000..4f8f6d9
Binary files /dev/null and b/course_UvA-DL/12-meta-learning/.thumb.jpg differ
diff --git a/course_UvA-DL/12-meta-learning/MAML_algorithm.svg b/course_UvA-DL/12-meta-learning/MAML_algorithm.svg
new file mode 100644
index 0000000..1812df4
--- /dev/null
+++ b/course_UvA-DL/12-meta-learning/MAML_algorithm.svg
@@ -0,0 +1,5318 @@
+ image/svg+xml
diff --git a/course_UvA-DL/12-meta-learning/MAML_figure.svg b/course_UvA-DL/12-meta-learning/MAML_figure.svg
new file mode 100644
index 0000000..9e5838b
--- /dev/null
+++ b/course_UvA-DL/12-meta-learning/MAML_figure.svg
@@ -0,0 +1,935 @@
+ image/svg+xml
diff --git a/course_UvA-DL/12-meta-learning/Meta_Learning.py b/course_UvA-DL/12-meta-learning/Meta_Learning.py
new file mode 100644
index 0000000..bf87dd8
--- /dev/null
+++ b/course_UvA-DL/12-meta-learning/Meta_Learning.py
@@ -0,0 +1,1337 @@
+# %% [markdown]
+#
+# Meta-Learning offers solutions to these situations, and we will discuss three popular algorithms: __Prototypical Networks__ ([Snell et al., 2017](https://arxiv.org/pdf/1703.05175.pdf)), __Model-Agnostic Meta-Learning / MAML__ ([Finn et al., 2017](http://proceedings.mlr.press/v70/finn17a.html)), and __Proto-MAML__ ([Triantafillou et al., 2020](https://openreview.net/pdf?id=rkgAGAVKPr)). +# We will focus on the task of few-shot classification where the training and test set have distinct sets of classes. +# For instance, we would train the model on the binary classifications of cats-birds and flowers-bikes, but during test time, the model would need to learn from 4 examples each the difference between dogs and otters, two classes we have not seen during training (Figure credit - [Lilian Weng](https://lilianweng.github.io/lil-log/2018/11/30/meta-learning.html)). +# +#
+# +# A different setup, which is very common in Reinforcement Learning and recently Natural Language Processing, is to aim at few-shot learning of a completely new task. +# For example, an robot agent that learned to run, jump and pick up boxes, should quickly adapt to collecting and stacking boxes. +# In NLP, we can think of a model which was trained sentiment classification, hatespeech detection and sarcasm classification, to adapt to classifying the emotion of a text. +# All methods we will discuss in this notebook can be easily applied to these settings since we only use a different definition of a 'task'. +# For few-shot classification, we consider a task to distinguish between $M$ novel classes. +# Here, we would not only have novel classes, but also a completely different dataset. +# +# First of all, let's start with importing our standard libraries. We will again be using PyTorch Lightning. + +# %% +import json +import os +import random +import urllib.request +from collections import defaultdict +from copy import deepcopy +from statistics import mean, stdev +from urllib.error import HTTPError + +import lightning as L +import matplotlib +import matplotlib.pyplot as plt +import matplotlib_inline.backend_inline +import numpy as np +import seaborn as sns +import torch +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data as data +import torchvision +from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint +from PIL import Image +from torchvision import transforms +from torchvision.datasets import CIFAR100, SVHN +from tqdm.auto import tqdm + +plt.set_cmap("cividis") +# %matplotlib inline +matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf") # For export +matplotlib.rcParams["lines.linewidth"] = 2.0 +sns.reset_orig() + +# Import tensorboard +# %load_ext tensorboard + +# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10) +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") +# Path to the folder where the pretrained models are saved +CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/MetaLearning/") + +# Setting the seed +L.seed_everything(42) + +# Ensure that all operations are deterministic on GPU (if used) for reproducibility +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + +device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") +print("Device:", device) + +# %% [markdown] +# Training the models in this notebook can take between 2 and 8 hours, and the evaluation time of some algorithms is in the span of couples of minutes. +# Hence, we download pre-trained models and results below. + +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial16/" +# Files to download +pretrained_files = [ + "ProtoNet.ckpt", + "ProtoMAML.ckpt", + "tensorboards/ProtoNet/events.out.tfevents.ProtoNet", + "tensorboards/ProtoMAML/events.out.tfevents.ProtoMAML", + "protomaml_fewshot.json", + "protomaml_svhn_fewshot.json", +] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. 
+for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if "/" in file_name: + os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print("Downloading %s..." % file_url) + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# ## Few-shot classification +# +# We start our implementation by discussing the dataset setup. +# In this notebook, we will use CIFAR100 which we have already seen in Tutorial 6. +# CIFAR100 has 100 classes each with 600 images of size $32\times 32$ pixels. +# Instead of splitting the training, validation and test set over examples, we will split them over classes: we will use 80 classes for training, and 10 for validation and 10 for testing. +# Our overall goal is to obtain a model that can distinguish between the 10 test classes with seeing very little examples. +# First, let's load the dataset and visualize some examples. + +# %% +# Loading CIFAR100 dataset +cifar_train_set = CIFAR100(root=DATASET_PATH, train=True, download=True, transform=transforms.ToTensor()) +cifar_test_set = CIFAR100(root=DATASET_PATH, train=False, download=True, transform=transforms.ToTensor()) + +# %% +# Visualize some examples +NUM_IMAGES = 12 +cifar_images = [cifar_train_set[np.random.randint(len(cifar_train_set))][0] for idx in range(NUM_IMAGES)] +cifar_images = torch.stack(cifar_images, dim=0) +img_grid = torchvision.utils.make_grid(cifar_images, nrow=6, normalize=True, pad_value=0.9) +img_grid = img_grid.permute(1, 2, 0) + +plt.figure(figsize=(8, 8)) +plt.title("Image examples of the CIFAR100 dataset") +plt.imshow(img_grid) +plt.axis("off") +plt.show() +plt.close() + +# %% [markdown] +# ### Data preprocessing +# +# Next, we need to prepare the dataset in the training, validation and test split as mentioned before. +# The torchvision package gives us the training and test set as two separate dataset objects. +# The next code cells will merge the original training and test set, and then create the new train-val-test split. + +# %% +# Merging original training and test set +cifar_all_images = np.concatenate([cifar_train_set.data, cifar_test_set.data], axis=0) +cifar_all_targets = torch.LongTensor(cifar_train_set.targets + cifar_test_set.targets) + +# %% [markdown] +# To have an easier time handling the dataset, we define our own, simple dataset class below. +# It takes a set of images, labels/targets, and image transformations, and +# returns the corresponding images and labels element-wise. + + +# %% +class ImageDataset(data.Dataset): + def __init__(self, imgs, targets, img_transform=None): + """ + Args: + imgs: Numpy array of shape [N,32,32,3] containing all images. + targets: PyTorch array of shape [N] containing all labels. + img_transform: A torchvision transformation that should be applied + to the images before returning. If none, no transformation + is applied. 
+ """ + super().__init__() + self.img_transform = img_transform + self.imgs = imgs + self.targets = targets + + def __getitem__(self, idx): + img, target = self.imgs[idx], self.targets[idx] + img = Image.fromarray(img) + + if self.img_transform is not None: + img = self.img_transform(img) + + return img, target + + def __len__(self): + return self.imgs.shape[0] + + +# %% [markdown] +# Now, we can create the class splits. +# We will assign the classes randomly to training, validation and test, and use a 80%-10%-10% split. + +# %% +L.seed_everything(0) # Set seed for reproducibility +classes = torch.randperm(100) # Returns random permutation of numbers 0 to 99 +train_classes, val_classes, test_classes = classes[:80], classes[80:90], classes[90:] + +# %% [markdown] +# To get an intuition of the validation and test classes, we print the class names below: + +# %% +# Printing validation and test classes +idx_to_class = {val: key for key, val in cifar_train_set.class_to_idx.items()} +print("Validation classes:", [idx_to_class[c.item()] for c in val_classes]) +print("Test classes:", [idx_to_class[c.item()] for c in test_classes]) + +# %% [markdown] +# As we can see, the classes have quite some variety and some classes might be easier to distinguish than others. +# For instance, in the test classes, 'pickup_truck' is the only vehicle while the classes 'mushroom', 'worm' and 'forest' might be harder to keep apart. +# Remember that we want to learn the classification of those ten classes from 80 other classes in our training set, and few examples from the actual test classes. +# We will experiment with the number of examples per class. +# +# Finally, we can create the training, validation and test dataset according to our split above. +# For this, we create dataset objects of our previously defined class `ImageDataset`. + + +# %% +def dataset_from_labels(imgs, targets, class_set, **kwargs): + class_mask = (targets[:, None] == class_set[None, :]).any(dim=-1) + return ImageDataset(imgs=imgs[class_mask], targets=targets[class_mask], **kwargs) + + +# %% [markdown] +# As in our experiments before on CIFAR in Tutorial 5, 6 and 9, we normalize the dataset. +# Additionally, we use small augmentations during training to prevent overfitting. + +# %% +DATA_MEANS = (cifar_train_set.data / 255.0).mean(axis=(0, 1, 2)) +DATA_STD = (cifar_train_set.data / 255.0).std(axis=(0, 1, 2)) + +test_transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(DATA_MEANS, DATA_STD)]) +# For training, we add some augmentation. +train_transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)), + transforms.ToTensor(), + transforms.Normalize(DATA_MEANS, DATA_STD), + ] +) + +train_set = dataset_from_labels(cifar_all_images, cifar_all_targets, train_classes, img_transform=train_transform) +val_set = dataset_from_labels(cifar_all_images, cifar_all_targets, val_classes, img_transform=test_transform) +test_set = dataset_from_labels(cifar_all_images, cifar_all_targets, test_classes, img_transform=test_transform) + +# %% [markdown] +# ### Data sampling +# +# The strategy of how to use the available training data for learning few-shot adaptation is crucial in meta-learning. +# All three algorithms that we discuss here have a similar idea: simulate few-shot learning during training. +# Specifically, at each training step, we randomly select a small number of classes, and sample a small number of examples for each class. 
+# This represents our few-shot training batch, which we also refer to as **support set**. +# Additionally, we sample a second set of examples from the same classes, and refer to this batch as **query set**. +# Our training objective is to classify the query set correctly from seeing the support set and its corresponding labels. +# The main difference between our three methods (ProtoNet, MAML, and Proto-MAML) is in how they use the support set to adapt to the training classes. +# +# This subsection summarizes the code that is needed to create such training batches. +# In PyTorch, we can specify the data sampling procedure by so-called `Sampler` ([documentation](https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler)). +# Samplers are iteratable objects that return indices in the order in which the data elements should be sampled. +# In our previous notebooks, we usually used the option `shuffle=True` in the `data.DataLoader` objects which creates a sampler returning the data indices in a random order. +# Here, we focus on samplers that return batches of indices that correspond to support and query set batches. +# Below, we implement such a sampler. + + +# %% +class FewShotBatchSampler: + def __init__(self, dataset_targets, N_way, K_shot, include_query=False, shuffle=True, shuffle_once=False): + """FewShot Batch Sampler. + + Args: + dataset_targets: PyTorch tensor of the labels of the data elements. + N_way: Number of classes to sample per batch. + K_shot: Number of examples to sample per class in the batch. + include_query: If True, returns batch of size N_way*K_shot*2, which + can be split into support and query set. Simplifies + the implementation of sampling the same classes but + distinct examples for support and query set. + shuffle: If True, examples and classes are newly shuffled in each + iteration (for training) + shuffle_once: If True, examples and classes are shuffled once in + the beginning, but kept constant across iterations + (for validation) + """ + super().__init__() + self.dataset_targets = dataset_targets + self.N_way = N_way + self.K_shot = K_shot + self.shuffle = shuffle + self.include_query = include_query + if self.include_query: + self.K_shot *= 2 + self.batch_size = self.N_way * self.K_shot # Number of overall images per batch + + # Organize examples by class + self.classes = torch.unique(self.dataset_targets).tolist() + self.num_classes = len(self.classes) + self.indices_per_class = {} + self.batches_per_class = {} # Number of K-shot batches that each class can provide + for c in self.classes: + self.indices_per_class[c] = torch.where(self.dataset_targets == c)[0] + self.batches_per_class[c] = self.indices_per_class[c].shape[0] // self.K_shot + + # Create a list of classes from which we select the N classes per batch + self.iterations = sum(self.batches_per_class.values()) // self.N_way + self.class_list = [c for c in self.classes for _ in range(self.batches_per_class[c])] + if shuffle_once or self.shuffle: + self.shuffle_data() + else: + # For testing, we iterate over classes instead of shuffling them + sort_idxs = [ + i + p * self.num_classes for i, c in enumerate(self.classes) for p in range(self.batches_per_class[c]) + ] + self.class_list = np.array(self.class_list)[np.argsort(sort_idxs)].tolist() + + def shuffle_data(self): + # Shuffle the examples per class + for c in self.classes: + perm = torch.randperm(self.indices_per_class[c].shape[0]) + self.indices_per_class[c] = self.indices_per_class[c][perm] + # Shuffle the class list from which 
we sample. Note that this way of shuffling + # does not prevent to choose the same class twice in a batch. However, for + # training and validation, this is not a problem. + random.shuffle(self.class_list) + + def __iter__(self): + # Shuffle data + if self.shuffle: + self.shuffle_data() + + # Sample few-shot batches + start_index = defaultdict(int) + for it in range(self.iterations): + class_batch = self.class_list[it * self.N_way : (it + 1) * self.N_way] # Select N classes for the batch + index_batch = [] + for c in class_batch: # For each class, select the next K examples and add them to the batch + index_batch.extend(self.indices_per_class[c][start_index[c] : start_index[c] + self.K_shot]) + start_index[c] += self.K_shot + if self.include_query: # If we return support+query set, sort them so that they are easy to split + index_batch = index_batch[::2] + index_batch[1::2] + yield index_batch + + def __len__(self): + return self.iterations + + +# %% [markdown] +# Now, we can create our intended data loaders by passing an object of `FewShotBatchSampler` as `batch_sampler=...` input to the PyTorch data loader object. +# For our experiments, we will use a 5-class 4-shot training setting. +# This means that each support set contains 5 classes with 4 examples each, i.e., 20 images overall. +# Usually, it is good to keep the number of shots equal to the number that you aim to test on. +# However, we will experiment later with different number of shots, and hence, we pick 4 as a compromise for now. +# To get the best performing model, it is recommended to consider the +# number of training shots as hyperparameter in a grid search. + +# %% +N_WAY = 5 +K_SHOT = 4 +train_data_loader = data.DataLoader( + train_set, + batch_sampler=FewShotBatchSampler(train_set.targets, include_query=True, N_way=N_WAY, K_shot=K_SHOT, shuffle=True), + num_workers=4, +) +val_data_loader = data.DataLoader( + val_set, + batch_sampler=FewShotBatchSampler( + val_set.targets, include_query=True, N_way=N_WAY, K_shot=K_SHOT, shuffle=False, shuffle_once=True + ), + num_workers=4, +) + +# %% [markdown] +# For simplicity, we implemented the sampling of a support and query set as sampling a support set with twice the number of examples. +# After sampling a batch from the data loader, we need to split it into a support and query set. +# We can summarize this step in the following function: + + +# %% +def split_batch(imgs, targets): + support_imgs, query_imgs = imgs.chunk(2, dim=0) + support_targets, query_targets = targets.chunk(2, dim=0) + return support_imgs, query_imgs, support_targets, query_targets + + +# %% [markdown] +# Finally, to ensure that our implementation of the data sampling process is correct, we can sample a batch and visualize its support and query set. +# What we would like to see is that the support and query set have the same classes, but distinct examples. 
+ +# %% +imgs, targets = next(iter(val_data_loader)) # We use the validation set since it does not apply augmentations +support_imgs, query_imgs, _, _ = split_batch(imgs, targets) +support_grid = torchvision.utils.make_grid(support_imgs, nrow=K_SHOT, normalize=True, pad_value=0.9) +support_grid = support_grid.permute(1, 2, 0) +query_grid = torchvision.utils.make_grid(query_imgs, nrow=K_SHOT, normalize=True, pad_value=0.9) +query_grid = query_grid.permute(1, 2, 0) + +fig, ax = plt.subplots(1, 2, figsize=(8, 5)) +ax[0].imshow(support_grid) +ax[0].set_title("Support set") +ax[0].axis("off") +ax[1].imshow(query_grid) +ax[1].set_title("Query set") +ax[1].axis("off") +fig.suptitle("Few Shot Batch", weight="bold") +fig.show() +plt.close(fig) + +# %% [markdown] +# As we can see, the support and query set have the same five classes, but different examples. +# The models will be tasked to classify the examples in the query set by learning from the support set and its labels. +# With the data sampling in place, we can now start to implement our first meta-learning model: Prototypical Networks. + +# %% [markdown] +# ## Prototypical Networks +# +#
+ +# %% [markdown] +# The Prototypical Network, or ProtoNet for short, is a metric-based meta-learning algorithm which operates similar to a nearest neighbor classification. +# Metric-based meta-learning methods classify a new example $\mathbf{x}$ based on some distance function $d_{\varphi}$ between $x$ and all elements in the support set. +# ProtoNets implements this idea with the concept of prototypes in a learned feature space. +# First, ProtoNet uses an embedding function $f_{\theta}$ to encode each input in the support set into a $L$-dimensional feature vector. +# Next, for each class $c$, we collect the feature vectors of all examples with label $c$, and average their feature vectors. +# Formally, we can define this as: +# +# $$\mathbf{v}_c=\frac{1}{|S_c|}\sum_{(\mathbf{x}_i,y_i)\in S_c}f_{\theta}(\mathbf{x}_i)$$ +# +# where $S_c$ is the part of the support set $S$ for which $y_i=c$, and $\mathbf{v}_c$ represents the _prototype_ of class $c$. +# The prototype calculation is visualized below for a 2-dimensional feature space and 3 classes (Figure credit - [Snell et al.](https://arxiv.org/pdf/1703.05175.pdf)). +# The colored dots represent encoded support elements with color-corresponding class label, and the black dots next to the class label are the averaged prototypes. +# +#
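+
+# %% [markdown]
+# As a minimal numeric sketch of this averaging step (toy feature vectors, unrelated to the encoder we
+# train below):
+
+# %%
+# Toy example: 6 support feature vectors from 3 classes (2 shots each) -> 3 prototypes.
+_feats = torch.randn(6, 4)                  # [num_support, feature_dim]
+_labels = torch.tensor([0, 0, 1, 1, 2, 2])  # class label of each support example
+_prototypes = torch.stack([_feats[_labels == c].mean(dim=0) for c in _labels.unique()])
+print(_prototypes.shape)  # -> torch.Size([3, 4])
+
+# %% [markdown]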
+# +# Based on these prototypes, we want to classify a new example. +# Remember that since we want to learn the encoding function $f_{\theta}$, this classification must be differentiable and hence, we need to define a probability distribution across classes. +# For this, we will make use of the distance function $d_{\varphi}$: the closer a new example $\mathbf{x}$ is to a prototype $\mathbf{v}_c$, the higher the probability for $\mathbf{x}$ belonging to class $c$. +# Formally, we can simply use a softmax over the distances of $\mathbf{x}$ to all class prototypes: +# +# $$p(y=c\vert\mathbf{x})=\text{softmax}(-d_{\varphi}(f_{\theta}(\mathbf{x}), \mathbf{v}_c))=\frac{\exp\left(-d_{\varphi}(f_{\theta}(\mathbf{x}), \mathbf{v}_c)\right)}{\sum_{c'\in \mathcal{C}}\exp\left(-d_{\varphi}(f_{\theta}(\mathbf{x}), \mathbf{v}_{c'})\right)}$$ +# +# Note that the negative sign is necessary since we want to increase the probability for close-by vectors and have a low probability for distant vectors. +# We train the network $f_{\theta}$ based on the cross entropy error of the training query set examples. +# Thereby, the gradient flows through both the prototypes $\mathbf{v}_c$ and the query set encodings $f_{\theta}(\mathbf{x})$. +# For the distance function $d_{\varphi}$, we can choose any function as long as it is differentiable with respect to both of its inputs. +# The most common function, which we also use here, is the squared +# euclidean distance, but there has been several works on different +# distance functions as well. + +# %% [markdown] +# ### ProtoNet implementation + +# %% [markdown] +# Now that we know how a ProtoNet works in principle, let's look at how we can apply to our specific problem of few-shot image classification, and implement it below. +# First, we need to define the encoder function $f_{\theta}$. +# Since we work with CIFAR images, we can take a look back at Tutorial 5 where we compared common Computer Vision architectures, and choose one of the best performing ones. +# Here, we go with a DenseNet since it is in general more parameter efficient than ResNet. +# Luckily, we do not need to implement DenseNet ourselves again and can rely on torchvision's model package instead. +# We use common hyperparameters of 64 initial feature channels, add 32 per block, and use a bottleneck size of 64 (i.e. 2 times the growth rate). +# We use 4 stages of 6 layers each, which results in overall about 1 million parameters. +# Note that the torchvision package assumes that the last layer is used for classification and hence calls its output size `num_classes`. +# However, we can instead just use it as the feature space of ProtoNet, and choose an arbitrary dimensionality. +# We will use the same network for other algorithms in this notebook to ensure a fair comparison. + + +# %% +def get_convnet(output_size): + convnet = torchvision.models.DenseNet( + growth_rate=32, + block_config=(6, 6, 6, 6), + bn_size=2, + num_init_features=64, + num_classes=output_size, # Output dimensionality + ) + return convnet + + +# %% [markdown] +# Next, we can look at implementing ProtoNet. +# We will define it as PyTorch Lightning module to use all functionalities of PyTorch Lightning. +# The first step during training is to encode all images in a batch with our network. +# Next, we calculate the class prototypes from the support set (function `calculate_prototypes`), and classify the query set examples according to the prototypes (function `classify_feats`). 
+# Keep in mind that we use the data sampling described before, such that the support and query set are stacked together in the batch. +# Thus, we use our previously defined function `split_batch` to split them apart. +# The full code can be found below. + + +# %% +class ProtoNet(L.LightningModule): + def __init__(self, proto_dim, lr): + """ProtoNet. + + Args: + proto_dim: Dimensionality of prototype feature space + lr: Learning rate of Adam optimizer + """ + super().__init__() + self.save_hyperparameters() + self.model = get_convnet(output_size=self.hparams.proto_dim) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr) + scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[140, 180], gamma=0.1) + return [optimizer], [scheduler] + + @staticmethod + def calculate_prototypes(features, targets): + # Given a stack of features vectors and labels, return class prototypes + # features - shape [N, proto_dim], targets - shape [N] + classes, _ = torch.unique(targets).sort() # Determine which classes we have + prototypes = [] + for c in classes: + p = features[torch.where(targets == c)[0]].mean(dim=0) # Average class feature vectors + prototypes.append(p) + prototypes = torch.stack(prototypes, dim=0) + # Return the 'classes' tensor to know which prototype belongs to which class + return prototypes, classes + + def classify_feats(self, prototypes, classes, feats, targets): + # Classify new examples with prototypes and return classification error + dist = torch.pow(prototypes[None, :] - feats[:, None], 2).sum(dim=2) # Squared euclidean distance + preds = F.log_softmax(-dist, dim=1) + labels = (classes[None, :] == targets[:, None]).long().argmax(dim=-1) + acc = (preds.argmax(dim=1) == labels).float().mean() + return preds, labels, acc + + def calculate_loss(self, batch, mode): + # Determine training loss for a given support and query set + imgs, targets = batch + features = self.model(imgs) # Encode all images of support and query set + support_feats, query_feats, support_targets, query_targets = split_batch(features, targets) + prototypes, classes = ProtoNet.calculate_prototypes(support_feats, support_targets) + preds, labels, acc = self.classify_feats(prototypes, classes, query_feats, query_targets) + loss = F.cross_entropy(preds, labels) + + self.log("%s_loss" % mode, loss) + self.log("%s_acc" % mode, acc) + return loss + + def training_step(self, batch, batch_idx): + return self.calculate_loss(batch, mode="train") + + def validation_step(self, batch, batch_idx): + self.calculate_loss(batch, mode="val") + + +# %% [markdown] +# For validation, we use the same principle as training and sample support and query sets from the hold-out 10 classes. +# However, this gives us noisy scores depending on which query sets are chosen to which support sets. +# This is why we will use a different strategy during testing. +# For validation, our training strategy is sufficient since it is much +# faster than testing, and gives a good estimate of the training +# generalization as long as we keep the support-query sets constant across +# validation iterations. + +# %% [markdown] +# ### Training +# +# After implementing the model, we can already start training it. +# We use our common PyTorch Lightning training function, and train the model for 200 epochs. +# The training function takes `model_class` as input argument, i.e. the +# PyTorch Lightning module class that should be trained, since we will +# reuse this function for other algorithms as well. 
+ + +# %% +def train_model(model_class, train_loader, val_loader, **kwargs): + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, model_class.__name__), + accelerator="auto", + devices=1, + max_epochs=200, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"), + LearningRateMonitor("epoch"), + ], + enable_progress_bar=False, + ) + trainer.logger._default_hp_metric = None + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, model_class.__name__ + ".ckpt") + if os.path.isfile(pretrained_filename): + print("Found pretrained model at %s, loading..." % pretrained_filename) + # Automatically loads the model with the saved hyperparameters + model = model_class.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything(42) # To be reproducable + model = model_class(**kwargs) + trainer.fit(model, train_loader, val_loader) + model = model_class.load_from_checkpoint( + trainer.checkpoint_callback.best_model_path + ) # Load best checkpoint after training + + return model + + +# %% [markdown] +# Below is the training call for our ProtoNet. +# We use a 64-dimensional feature space. +# Larger feature spaces showed to give noisier results since the squared euclidean distance becomes proportionally larger in expectation, and smaller feature spaces might not allow for enough flexibility. +# We recommend to load the pre-trained model here at first, but feel free +# to play around with the hyperparameters yourself. + +# %% +protonet_model = train_model( + ProtoNet, proto_dim=64, lr=2e-4, train_loader=train_data_loader, val_loader=val_data_loader +) + +# %% [markdown] +# We can also take a closer look at the TensorBoard below. + +# %% +# Opens tensorboard in notebook. Adjust the path to your CHECKPOINT_PATH if needed +# # %tensorboard --logdir ../saved_models/tutorial16/tensorboards/ProtoNet/ + +# %% [markdown] +#
+# +# In contrast to standard supervised learning, we see that ProtoNet does not overfit as much as we would expect. +# The validation accuracy is of course lower than the average training, but the training loss does not stick close to zero. +# This is because no training batch is as the other, and we also mix new examples in the support set and query set. +# This gives us slightly different prototypes in every iteration, and makes it harder for the network to fully overfit. + +# %% [markdown] +# ### Testing +# +# Our goal of meta-learning is to obtain a model that can quickly adapt to a new task, or in this case, new classes to distinguish between. +# To test this, we will use our trained ProtoNet and adapt it to the 10 test classes. +# Thereby, we pick $k$ examples per class from which we determine the prototypes, and test the classification accuracy on all other examples. +# This can be seen as using the $k$ examples per class as support set, and the rest of the dataset as a query set. +# We iterate through the dataset such that each example has been once included in a support set. +# The average performance over all support sets tells us how well we can expect ProtoNet to perform when seeing only $k$ examples per class. +# During training, we used $k=4$. +# In testing, we will experiment with $k=\{2,4,8,16,32\}$ to get a better sense of how $k$ influences the results. +# We would expect that we achieve higher accuracies the more examples we have in the support set, but we don't know how it scales. +# Hence, let's first implement a function that executes the testing procedure for a given $k$: + + +# %% +@torch.no_grad() +def test_proto_net(model, dataset, data_feats=None, k_shot=4): + """Test proto net. + + Args: + model: Pretrained ProtoNet model + dataset: The dataset on which the test should be performed. + Should be instance of ImageDataset + data_feats: The encoded features of all images in the dataset. + If None, they will be newly calculated, and returned + for later usage. + k_shot: Number of examples per class in the support set. + """ + model = model.to(device) + model.eval() + num_classes = dataset.targets.unique().shape[0] + exmps_per_class = dataset.targets.shape[0] // num_classes # We assume uniform example distribution here + + # The encoder network remains unchanged across k-shot settings. Hence, we only need + # to extract the features for all images once. + if data_feats is None: + # Dataset preparation + dataloader = data.DataLoader(dataset, batch_size=128, num_workers=4, shuffle=False, drop_last=False) + + img_features = [] + img_targets = [] + for imgs, targets in tqdm(dataloader, "Extracting image features", leave=False): + imgs = imgs.to(device) + feats = model.model(imgs) + img_features.append(feats.detach().cpu()) + img_targets.append(targets) + img_features = torch.cat(img_features, dim=0) + img_targets = torch.cat(img_targets, dim=0) + # Sort by classes, so that we obtain tensors of shape [num_classes, exmps_per_class, ...] + # Makes it easier to process later + img_targets, sort_idx = img_targets.sort() + img_targets = img_targets.reshape(num_classes, exmps_per_class).transpose(0, 1) + img_features = img_features[sort_idx].reshape(num_classes, exmps_per_class, -1).transpose(0, 1) + else: + img_features, img_targets = data_feats + + # We iterate through the full dataset in two manners. First, to select the k-shot batch. 
+ # Second, the evaluate the model on all other examples + accuracies = [] + for k_idx in tqdm(range(0, img_features.shape[0], k_shot), "Evaluating prototype classification", leave=False): + # Select support set and calculate prototypes + k_img_feats = img_features[k_idx : k_idx + k_shot].flatten(0, 1) + k_targets = img_targets[k_idx : k_idx + k_shot].flatten(0, 1) + prototypes, proto_classes = model.calculate_prototypes(k_img_feats, k_targets) + # Evaluate accuracy on the rest of the dataset + batch_acc = 0 + for e_idx in range(0, img_features.shape[0], k_shot): + if k_idx == e_idx: # Do not evaluate on the support set examples + continue + e_img_feats = img_features[e_idx : e_idx + k_shot].flatten(0, 1) + e_targets = img_targets[e_idx : e_idx + k_shot].flatten(0, 1) + _, _, acc = model.classify_feats(prototypes, proto_classes, e_img_feats, e_targets) + batch_acc += acc.item() + batch_acc /= img_features.shape[0] // k_shot - 1 + accuracies.append(batch_acc) + + return (mean(accuracies), stdev(accuracies)), (img_features, img_targets) + + +# %% [markdown] +# Testing ProtoNet is relatively quick if we have processed all images once. Hence, we can do in this notebook: + +# %% +protonet_accuracies = dict() +data_feats = None +for k in [2, 4, 8, 16, 32]: + protonet_accuracies[k], data_feats = test_proto_net(protonet_model, test_set, data_feats=data_feats, k_shot=k) + print( + "Accuracy for k=%i: %4.2f%% (+-%4.2f%%)" + % (k, 100.0 * protonet_accuracies[k][0], 100 * protonet_accuracies[k][1]) + ) + +# %% [markdown] +# Before discussing the results above, let's first plot the accuracies over number of examples in the support set: + + +# %% +def plot_few_shot(acc_dict, name, color=None, ax=None): + sns.set() + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=(5, 3)) + ks = sorted(list(acc_dict.keys())) + mean_accs = [acc_dict[k][0] for k in ks] + std_accs = [acc_dict[k][1] for k in ks] + ax.plot(ks, mean_accs, marker="o", markeredgecolor="k", markersize=6, label=name, color=color) + ax.fill_between( + ks, + [m - s for m, s in zip(mean_accs, std_accs)], + [m + s for m, s in zip(mean_accs, std_accs)], + alpha=0.2, + color=color, + ) + ax.set_xticks(ks) + ax.set_xlim([ks[0] - 1, ks[-1] + 1]) + ax.set_xlabel("Number of shots per class", weight="bold") + ax.set_ylabel("Accuracy", weight="bold") + if len(ax.get_title()) == 0: + ax.set_title("Few-Shot Performance " + name, weight="bold") + else: + ax.set_title(ax.get_title() + " and " + name, weight="bold") + ax.legend() + return ax + + +# %% +ax = plot_few_shot(protonet_accuracies, name="ProtoNet", color="C1") +plt.show() +plt.close() + +# %% [markdown] +# As we initially expected, the performance of ProtoNet indeed increases the more samples we have. +# However, even with just two samples per class, we classify almost half of the images correctly, which is well above random accuracy (10%). +# The curve shows an exponentially dampend trend, meaning that adding 2 extra examples to $k=2$ has a much higher impact than adding 2 extra samples if we already have $k=16$. +# Nonetheless, we can say that ProtoNet adapts fairly well to new classes. + +# %% [markdown] +# ## MAML and ProtoMAML +# +#
+ +# %% [markdown] +# The second meta-learning algorithm we will look at is MAML, short for Model-Agnostic Meta-Learning. +# MAML is an optimization-based meta-learning algorithm, which means that it tries to adjust the standard optimization procedure to a few-shot setting. +# The idea of MAML is relatively simple: given a model, support and query set during training, we optimize the model for $m$ steps on the support set, and evaluate the gradients of the query loss with respect to the original model's parameters. +# For the same model, we do it for a few different support-query sets and accumulate the gradients. +# This results in learning a model that provides a good initialization for being quickly adapted to the training tasks. +# If we denote the model parameters with $\theta$, we can visualize the procedure as follows (Figure credit - [Finn et al. ](http://proceedings.mlr.press/v70/finn17a.html)). +# +#
+
+# %% [markdown]
+# The full algorithm of MAML is therefore as follows.
+# At each training step, we sample a batch of tasks, i.e., a batch of support-query set pairs.
+# For each task $\mathcal{T}_i$, we optimize a model $f_{\theta}$ on the support set via SGD, and denote the resulting adapted model as $f_{\theta_i'}$.
+# We refer to this optimization as the _inner loop_.
+# Using this adapted model, we calculate the gradients of the query loss of $f_{\theta_i'}$ with respect to the original parameters $\theta$.
+# These gradients are accumulated over all tasks, and used to update $\theta$.
+# This is called the _outer loop_ since we iterate over tasks.
+# The full MAML algorithm is summarized below (Figure credit - [Finn et al. ](http://proceedings.mlr.press/v70/finn17a.html)).
+#
+#
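+
+# %% [markdown]
+# As a compact summary of the procedure above, a single MAML training step with one inner gradient step per task
+# can be written as follows, where $\alpha$ and $\beta$ denote the inner and outer loop learning rates, and the
+# superscripts only mark on which set the loss is evaluated (this notation is used only in this summary):
+#
+# $$\theta_i'=\theta-\alpha\nabla_{\theta}\mathcal{L}_{\mathcal{T}_i}^{\text{support}}(f_{\theta})$$
+#
+# $$\theta\leftarrow\theta-\beta\sum_{\mathcal{T}_i\sim p(\mathcal{T})}\nabla_{\theta}\mathcal{L}_{\mathcal{T}_i}^{\text{query}}(f_{\theta_i'})$$
+#
+# Note that the outer gradient is taken with respect to the original parameters $\theta$, which requires
+# backpropagating through the inner update.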
+
+# %% [markdown]
+# To obtain gradients for the initial parameters $\theta$ from the optimized model $f_{\theta_i'}$, we actually need second-order gradients, i.e. gradients of gradients, as the support set gradients depend on $\theta$ as well.
+# This makes MAML computationally expensive, especially when using multiple inner loop steps.
+# A simpler, yet almost equally well-performing alternative is First-Order MAML (FOMAML), which only uses first-order gradients.
+# This means that the second-order gradients are ignored, and we can calculate the outer loop gradients (line 10 in algorithm 2) simply by calculating the gradients with respect to $\theta_i'$, and use those as the update to $\theta$.
+# Hence, the new update rule becomes:
+# $$\theta\leftarrow\theta-\beta\sum_{\mathcal{T}_i\sim p(\mathcal{T})}\nabla_{\theta_i'}\mathcal{L}_{\mathcal{T}_i}(f_{\theta_i'})$$
+# Note the change of $\theta$ to $\theta_i'$ for $\nabla$.
+
+# %% [markdown]
+# ### ProtoMAML
+#
+# A problem of MAML is how to design the output classification layer.
+# In case the tasks have different numbers of classes, we need to initialize the output layer with zeros or randomly in every iteration.
+# Even if we always have the same number of classes, we would still start from random predictions.
+# This requires several inner loop steps to reach a reasonable classification result.
+# To overcome this problem, Triantafillou et al. (2020) propose to combine the merits of Prototypical Networks and MAML.
+# Specifically, we can use prototypes to give our output layer a strong initialization.
+# It can be shown that a softmax over euclidean distances can be reformulated as a linear layer followed by a softmax.
+# To see this, let's first write out the negative euclidean distance between a feature vector $f_{\theta}(\mathbf{x}^{*})$ of a new data point $\mathbf{x}^{*}$ and a prototype $\mathbf{v}_c$ of class $c$:
+# $$
+# -||f_{\theta}(\mathbf{x}^{*})-\mathbf{v}_c||^2=-f_{\theta}(\mathbf{x}^{*})^Tf_{\theta}(\mathbf{x}^{*})+2\mathbf{v}_c^{T}f_{\theta}(\mathbf{x}^{*})-\mathbf{v}_c^T\mathbf{v}_c
+# $$
+#
+# We perform the classification across all classes $c\in\mathcal{C}$ and take a softmax on the distance.
+# Hence, any term that is the same for all classes can be removed without changing the output probabilities.
+# In the equation above, this is true for $-f_{\theta}(\mathbf{x}^{*})^Tf_{\theta}(\mathbf{x}^{*})$ since it is independent of any class prototype.
+# Thus, we can write:
+#
+# $$
+# -||f_{\theta}(\mathbf{x}^{*})-\mathbf{v}_c||^2=2\mathbf{v}_c^{T}f_{\theta}(\mathbf{x}^{*})-||\mathbf{v}_c||^2+\text{constant}
+# $$
+#
+# Taking a second look at the equation above, the right-hand side looks a lot like a linear layer.
+# For this, we use $\mathbf{W}_{c,\cdot}=2\mathbf{v}_c$ and $b_c=-||\mathbf{v}_c||^2$, which gives us the linear layer $\mathbf{W}f_{\theta}(\mathbf{x}^{*})+\mathbf{b}$.
+# Hence, if we initialize the output weights with twice the prototypes, and the biases with the negative squared L2 norm of the prototypes, we start with a Prototypical Network.
+# MAML allows us to adapt this layer and the rest of the network further.
+#
+# In the following, we will implement First-Order ProtoMAML for few-shot classification.
+# The implementation of MAML would be the same except for the output layer initialization.
+
+# %% [markdown]
+# ### ProtoMAML implementation
+#
+# For implementing ProtoMAML, we can follow Algorithm 2 with minor modifications.
+# At each training step, we first sample a batch of tasks, and a support and query set for each task.
+# In our case of few-shot classification, this means that we simply sample multiple support-query set pairs from our sampler.
+# For each task, we finetune our current model on the support set.
+# However, since we need to keep the original parameters for the other tasks, the outer loop gradient update, and future training steps, we create a copy of our model and finetune only the copy.
+# We can copy a model by using standard Python functions like `deepcopy`.
+# The inner loop is implemented in the function `adapt_few_shot` in the PyTorch Lightning module below.
+#
+# After finetuning the model, we apply it to the query set and calculate the first-order gradients with respect to the original parameters $\theta$.
+# In contrast to plain MAML, we also have to consider the gradients with respect to the output layer initialization, i.e. the prototypes, since they directly rely on $\theta$.
+# To realize this efficiently, we take two steps.
+# First, we calculate the prototypes by applying the original model, i.e. not the copied model, to the support elements.
+# When initializing the output layer, we detach the prototypes to stop the gradients.
+# This is because in the inner loop itself, we do not want to consider gradients through the prototypes back to the original model.
+# However, after the inner loop is finished, we re-attach the computation graph of the prototypes by writing `output_weight = (output_weight - init_weight).detach() + init_weight`.
+# While this line does not change the value of the variable `output_weight`, it re-introduces its dependency on the prototype initialization `init_weight`.
+# Thus, when the query loss is backpropagated, we automatically obtain the first-order gradients with respect to the prototype initialization in the original model.
+#
+# After calculating all gradients and summing them together in the original model, we can take a standard optimizer step.
+# PyTorch Lightning's `training_step` is, however, designed to return a loss tensor on which Lightning then calls `.backward` and performs the optimizer step automatically.
+# Since this is not possible here, we need to perform the optimization step ourselves.
+# All details can be found in the code below.
+#
+# For implementing (Proto-)MAML with second-order gradients, it is recommended to use libraries such as [$\nabla$higher](https://github.com/facebookresearch/higher) from Facebook AI Research.
+# For simplicity, we stick with first-order methods here.
+
+
+# %%
+class ProtoMAML(L.LightningModule):
+    def __init__(self, proto_dim, lr, lr_inner, lr_output, num_inner_steps):
+        """ProtoMAML.
+ + Args: + proto_dim: Dimensionality of prototype feature space + lr: Learning rate of the outer loop Adam optimizer + lr_inner: Learning rate of the inner loop SGD optimizer + lr_output: Learning rate for the output layer in the inner loop + num_inner_steps: Number of inner loop updates to perform + """ + super().__init__() + self.save_hyperparameters() + self.model = get_convnet(output_size=self.hparams.proto_dim) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr) + scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[140, 180], gamma=0.1) + return [optimizer], [scheduler] + + def run_model(self, local_model, output_weight, output_bias, imgs, labels): + # Execute a model with given output layer weights and inputs + feats = local_model(imgs) + preds = F.linear(feats, output_weight, output_bias) + loss = F.cross_entropy(preds, labels) + acc = (preds.argmax(dim=1) == labels).float() + return loss, preds, acc + + def adapt_few_shot(self, support_imgs, support_targets): + # Determine prototype initialization + support_feats = self.model(support_imgs) + prototypes, classes = ProtoNet.calculate_prototypes(support_feats, support_targets) + support_labels = (classes[None, :] == support_targets[:, None]).long().argmax(dim=-1) + # Create inner-loop model and optimizer + local_model = deepcopy(self.model) + local_model.train() + local_optim = optim.SGD(local_model.parameters(), lr=self.hparams.lr_inner) + local_optim.zero_grad() + # Create output layer weights with prototype-based initialization + init_weight = 2 * prototypes + init_bias = -torch.norm(prototypes, dim=1) ** 2 + output_weight = init_weight.detach().requires_grad_() + output_bias = init_bias.detach().requires_grad_() + + # Optimize inner loop model on support set + for _ in range(self.hparams.num_inner_steps): + # Determine loss on the support set + loss, _, _ = self.run_model(local_model, output_weight, output_bias, support_imgs, support_labels) + # Calculate gradients and perform inner loop update + loss.backward() + local_optim.step() + # Update output layer via SGD + output_weight.data -= self.hparams.lr_output * output_weight.grad + output_bias.data -= self.hparams.lr_output * output_bias.grad + # Reset gradients + local_optim.zero_grad() + output_weight.grad.fill_(0) + output_bias.grad.fill_(0) + + # Re-attach computation graph of prototypes + output_weight = (output_weight - init_weight).detach() + init_weight + output_bias = (output_bias - init_bias).detach() + init_bias + + return local_model, output_weight, output_bias, classes + + def outer_loop(self, batch, mode="train"): + accuracies = [] + losses = [] + self.model.zero_grad() + + # Determine gradients for batch of tasks + for task_batch in batch: + imgs, targets = task_batch + support_imgs, query_imgs, support_targets, query_targets = split_batch(imgs, targets) + # Perform inner loop adaptation + local_model, output_weight, output_bias, classes = self.adapt_few_shot(support_imgs, support_targets) + # Determine loss of query set + query_labels = (classes[None, :] == query_targets[:, None]).long().argmax(dim=-1) + loss, preds, acc = self.run_model(local_model, output_weight, output_bias, query_imgs, query_labels) + # Calculate gradients for query set loss + if mode == "train": + loss.backward() + + for p_global, p_local in zip(self.model.parameters(), local_model.parameters()): + p_global.grad += p_local.grad # First-order approx. 
-> add gradients of finetuned and base model
+
+            accuracies.append(acc.mean().detach())
+            losses.append(loss.detach())
+
+        # Perform update of base model
+        if mode == "train":
+            opt = self.optimizers()
+            opt.step()
+            opt.zero_grad()
+
+        self.log("%s_loss" % mode, sum(losses) / len(losses))
+        self.log("%s_acc" % mode, sum(accuracies) / len(accuracies))
+
+    def training_step(self, batch, batch_idx):
+        self.outer_loop(batch, mode="train")
+        return None  # Returning None means we skip the default optimizer step performed by PyTorch Lightning
+
+    def validation_step(self, batch, batch_idx):
+        # Validation requires finetuning a model, hence we need to enable gradients
+        torch.set_grad_enabled(True)
+        self.outer_loop(batch, mode="val")
+        torch.set_grad_enabled(False)
+
+
+# %% [markdown]
+# ### Training
+#
+# To train ProtoMAML, we need to change our sampling slightly.
+# Instead of a single support-query set batch, we need to sample multiple.
+# To implement this, we use yet another sampler, which combines multiple batches from a `FewShotBatchSampler` and returns them together.
+# Additionally, we define a `collate_fn` for our data loader that takes the stack of support-query set images and returns the tasks as a list.
+# This makes the batches easier to process in the PyTorch Lightning module defined above.
+# The implementation of the sampler can be found below.
+
+
+# %%
+class TaskBatchSampler:
+    def __init__(self, dataset_targets, batch_size, N_way, K_shot, include_query=False, shuffle=True):
+        """Task Batch Sampler.
+
+        Args:
+            dataset_targets: PyTorch tensor of the labels of the data elements.
+            batch_size: Number of tasks to aggregate in a batch
+            N_way: Number of classes to sample per batch.
+            K_shot: Number of examples to sample per class in the batch.
+            include_query: If True, returns batch of size N_way*K_shot*2, which
+                can be split into support and query set. Simplifies
+                the implementation of sampling the same classes but
+                distinct examples for support and query set.
+            shuffle: If True, examples and classes are newly shuffled in each
+                iteration (for training)
+        """
+        super().__init__()
+        self.batch_sampler = FewShotBatchSampler(dataset_targets, N_way, K_shot, include_query, shuffle)
+        self.task_batch_size = batch_size
+        self.local_batch_size = self.batch_sampler.batch_size
+
+    def __iter__(self):
+        # Aggregate multiple batches before returning the indices
+        batch_list = []
+        for batch_idx, batch in enumerate(self.batch_sampler):
+            batch_list.extend(batch)
+            if (batch_idx + 1) % self.task_batch_size == 0:
+                yield batch_list
+                batch_list = []
+
+    def __len__(self):
+        return len(self.batch_sampler) // self.task_batch_size
+
+    def get_collate_fn(self):
+        # Returns a collate function that converts one big tensor into a list of task-specific tensors
+        def collate_fn(item_list):
+            imgs = torch.stack([img for img, target in item_list], dim=0)
+            targets = torch.stack([target for img, target in item_list], dim=0)
+            imgs = imgs.chunk(self.task_batch_size, dim=0)
+            targets = targets.chunk(self.task_batch_size, dim=0)
+            return list(zip(imgs, targets))
+
+        return collate_fn
+
+
+# %% [markdown]
+# With this sampler, the creation of the data loaders is straightforward.
+# Note that since many images need to be loaded for a training batch, it is recommended to use fewer workers than usual.
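+
+# %% [markdown]
+# Before creating the actual data loaders, the short check below is a purely illustrative sketch of what the
+# collate function produces: a list with `batch_size` tasks, where each task is a tuple of the stacked
+# support+query images and the matching targets.
+# The dummy labels and zero-valued image tensors used here are only assumptions for this illustration and are not
+# used anywhere else in the notebook.
+
+# %%
+dummy_targets = torch.arange(10).repeat_interleave(20)  # 10 dummy classes with 20 examples each
+dummy_sampler = TaskBatchSampler(dummy_targets, batch_size=2, N_way=5, K_shot=4, include_query=True)
+dummy_items = [(torch.zeros(3, 32, 32), torch.tensor(0)) for _ in range(2 * dummy_sampler.local_batch_size)]
+dummy_tasks = dummy_sampler.get_collate_fn()(dummy_items)
+# Expected: 2 tasks, each with images of shape [2*N_way*K_shot, 3, 32, 32] = [40, 3, 32, 32] and targets of shape [40]
+print(len(dummy_tasks), dummy_tasks[0][0].shape, dummy_tasks[0][1].shape)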
+
+# %%
+# Training constants (same as for ProtoNet)
+N_WAY = 5
+K_SHOT = 4
+
+# Training set
+train_protomaml_sampler = TaskBatchSampler(
+    train_set.targets, include_query=True, N_way=N_WAY, K_shot=K_SHOT, batch_size=16
+)
+train_protomaml_loader = data.DataLoader(
+    train_set, batch_sampler=train_protomaml_sampler, collate_fn=train_protomaml_sampler.get_collate_fn(), num_workers=2
+)
+
+# Validation set
+val_protomaml_sampler = TaskBatchSampler(
+    val_set.targets,
+    include_query=True,
+    N_way=N_WAY,
+    K_shot=K_SHOT,
+    batch_size=1,  # We do not update the parameters, hence the batch size is irrelevant here
+    shuffle=False,
+)
+val_protomaml_loader = data.DataLoader(
+    val_set, batch_sampler=val_protomaml_sampler, collate_fn=val_protomaml_sampler.get_collate_fn(), num_workers=2
+)
+
+# %% [markdown]
+# Now, we are ready to train our ProtoMAML.
+# We use the same feature space size as for ProtoNet, but can use a higher learning rate since the outer loop gradients are accumulated over 16 batches.
+# The inner loop learning rate is set to 0.1, which is much higher than the outer loop learning rate because we use SGD in the inner loop instead of Adam.
+# Commonly, the learning rate for the output layer is chosen higher than for the base model if the base model is very deep or pre-trained.
+# However, for our setup, we observed no noticeable impact of using a different learning rate for the output layer than for the base model.
+# The number of inner loop updates is another crucial hyperparameter, and depends on the similarity of our training tasks.
+# Since all tasks are on images from the same dataset, we notice that a single inner loop update achieves similar performance to 3 or 5 updates while training considerably faster.
+# However, especially in RL and NLP, larger numbers of inner loop steps are often needed.
+
+# %%
+protomaml_model = train_model(
+    ProtoMAML,
+    proto_dim=64,
+    lr=1e-3,
+    lr_inner=0.1,
+    lr_output=0.1,
+    num_inner_steps=1,  # Often values between 1 and 10
+    train_loader=train_protomaml_loader,
+    val_loader=val_protomaml_loader,
+)
+
+# %% [markdown]
+# Let's have a look at the training TensorBoard.
+
+# %%
+# Opens tensorboard in notebook. Adjust the path to your CHECKPOINT_PATH if needed
+# # %tensorboard --logdir ../saved_models/tutorial16/tensorboards/ProtoMAML/
+
+# %% [markdown]
+#
+# +# One obvious difference to ProtoNet is that the loss curves look much less noisy. +# This is because we average the outer loop gradients over multiple tasks, and thus have a smoother training curve. +# Additionally, we only have 15k training iterations after 200 epochs. +# This is again because of the task batches, which cause 16 times less iterations. +# However, each iteration has seen 16 times more data in this experiment. +# Thus, we still have a fair comparison between ProtoMAML and ProtoNet. +# At first sight on the validation accuracy, one would assume that +# ProtoNet performs superior to ProtoMAML, but we have to verify that with +# proper testing below. + +# %% [markdown] +# ### Testing +# +# We test ProtoMAML in the same manner as ProtoNet, namely by picking random examples in the test set as support sets and use the rest of the dataset as query set. +# Instead of just calculating the prototypes for all examples, we need to finetune a separate model for each support set. +# This is why this process is more expensive than ProtoNet, and in our case, testing $k=\{2,4,8,16,32\}$ can take almost an hour. +# Hence, we provide evaluation files besides the pretrained models. + + +# %% +def test_protomaml(model, dataset, k_shot=4): + L.seed_everything(42) + model = model.to(device) + num_classes = dataset.targets.unique().shape[0] + + # Data loader for full test set as query set + full_dataloader = data.DataLoader(dataset, batch_size=128, num_workers=4, shuffle=False, drop_last=False) + # Data loader for sampling support sets + sampler = FewShotBatchSampler( + dataset.targets, include_query=False, N_way=num_classes, K_shot=k_shot, shuffle=False, shuffle_once=False + ) + sample_dataloader = data.DataLoader(dataset, batch_sampler=sampler, num_workers=2) + + # We iterate through the full dataset in two manners. First, to select the k-shot batch. + # Second, the evaluate the model on all other examples + accuracies = [] + for (support_imgs, support_targets), support_indices in tqdm( + zip(sample_dataloader, sampler), "Performing few-shot finetuning" + ): + support_imgs = support_imgs.to(device) + support_targets = support_targets.to(device) + # Finetune new model on support set + local_model, output_weight, output_bias, classes = model.adapt_few_shot(support_imgs, support_targets) + with torch.no_grad(): # No gradients for query set needed + local_model.eval() + batch_acc = torch.zeros((0,), dtype=torch.float32, device=device) + # Evaluate all examples in test dataset + for query_imgs, query_targets in full_dataloader: + query_imgs = query_imgs.to(device) + query_targets = query_targets.to(device) + query_labels = (classes[None, :] == query_targets[:, None]).long().argmax(dim=-1) + _, _, acc = model.run_model(local_model, output_weight, output_bias, query_imgs, query_labels) + batch_acc = torch.cat([batch_acc, acc.detach()], dim=0) + # Exclude support set elements + for s_idx in support_indices: + batch_acc[s_idx] = 0 + batch_acc = batch_acc.sum().item() / (batch_acc.shape[0] - len(support_indices)) + accuracies.append(batch_acc) + return mean(accuracies), stdev(accuracies) + + +# %% [markdown] +# In contrast to training, it is recommended to use many more inner loop updates during testing. +# During training, we are not interested in getting the best model from the inner loop, but the model which can provide the best gradients. 
+# Hence, one update might already be sufficient during training, but for testing, it has often been observed that a larger number of updates can give a considerable performance boost.
+# Thus, we change the inner loop updates to 200 before testing.
+
+# %%
+protomaml_model.hparams.num_inner_steps = 200
+
+# %% [markdown]
+# Now, we can test our model.
+# For the pre-trained models, we provide a JSON file with the results to reduce evaluation time.
+
+# %%
+protomaml_result_file = os.path.join(CHECKPOINT_PATH, "protomaml_fewshot.json")
+
+if os.path.isfile(protomaml_result_file):
+    # Load pre-computed results
+    with open(protomaml_result_file) as f:
+        protomaml_accuracies = json.load(f)
+    protomaml_accuracies = {int(k): v for k, v in protomaml_accuracies.items()}
+else:
+    # Perform same experiments as for ProtoNet
+    protomaml_accuracies = dict()
+    for k in [2, 4, 8, 16, 32]:
+        protomaml_accuracies[k] = test_protomaml(protomaml_model, test_set, k_shot=k)
+    # Export results
+    with open(protomaml_result_file, "w") as f:
+        json.dump(protomaml_accuracies, f, indent=4)
+
+for k in protomaml_accuracies:
+    print(
+        "Accuracy for k=%i: %4.2f%% (+-%4.2f%%)"
+        % (k, 100.0 * protomaml_accuracies[k][0], 100.0 * protomaml_accuracies[k][1])
+    )
+
+# %% [markdown]
+# Again, let's plot the results in our plot from before.
+
+# %%
+ax = plot_few_shot(protonet_accuracies, name="ProtoNet", color="C1")
+plot_few_shot(protomaml_accuracies, name="ProtoMAML", color="C2", ax=ax)
+plt.show()
+plt.close()
+
+# %% [markdown]
+# We can observe that ProtoMAML is indeed able to outperform ProtoNet for $k>4$.
+# This is because with more samples, it becomes more relevant to also adapt the base model's parameters.
+# Meanwhile, for $k=2$, ProtoMAML achieves lower performance than ProtoNet.
+# This is likely also related to choosing 200 inner loop updates, since with more updates there is a greater risk of overfitting.
+# Nonetheless, the high standard deviation for $k=2$ makes it hard to draw any statistically valid conclusion.
+#
+# Overall, we can conclude that ProtoMAML slightly outperforms ProtoNet for larger shot counts.
+# However, one disadvantage of ProtoMAML is its much longer training and testing time.
+# ProtoNet provides a simple, efficient, yet strong baseline compared to ProtoMAML, and might be the better solution
+# in situations where limited resources are available.
+
+# %% [markdown]
+# ## Domain adaptation
+#
+# So far, we have evaluated our meta-learning algorithms on the same dataset on which we have trained them.
+# However, meta-learning algorithms are especially interesting when we want to move from one dataset to another.
+# So, what happens if we apply them to a dataset quite different from CIFAR?
+# This is what we try out below, evaluating ProtoNet and ProtoMAML on the SVHN dataset.
+
+# %% [markdown]
+# ### SVHN dataset
+#
+# The Street View House Numbers (SVHN) dataset is a real-world image dataset for house number detection.
+# It is similar to MNIST in that it contains the classes 0 to 9, but it is more difficult due to its real-world setting and possible distracting digits to the left and right.
+# Let's first load the dataset and visualize some images to get an impression of it.
+ +# %% +SVHN_test_dataset = SVHN(root=DATASET_PATH, split="test", download=True, transform=transforms.ToTensor()) + +# %% +# Visualize some examples +NUM_IMAGES = 12 +SVHN_images = [SVHN_test_dataset[np.random.randint(len(SVHN_test_dataset))][0] for idx in range(NUM_IMAGES)] +SVHN_images = torch.stack(SVHN_images, dim=0) +img_grid = torchvision.utils.make_grid(SVHN_images, nrow=6, normalize=True, pad_value=0.9) +img_grid = img_grid.permute(1, 2, 0) + +plt.figure(figsize=(8, 8)) +plt.title("Image examples of the SVHN dataset") +plt.imshow(img_grid) +plt.axis("off") +plt.show() +plt.close() + +# %% [markdown] +# Each image is labeled with one class between 0 and 9 representing the main digit in the image. +# Can our ProtoNet and ProtoMAML learn to classify the digits from only a few examples? +# This is what we will test out below. +# The images have the same size as CIFAR, so that we can use the images without changes. +# We first prepare the dataset, for which we take the first 500 images per class. +# For this dataset, we use our test functions as before to get an estimated performance for different number of shots. + +# %% +imgs = np.transpose(SVHN_test_dataset.data, (0, 2, 3, 1)) +targets = SVHN_test_dataset.labels +# Limit number of examples to 500 to reduce test time +min_label_count = min(500, np.bincount(SVHN_test_dataset.labels).min()) + +idxs = np.concatenate([np.where(targets == c)[0][:min_label_count] for c in range(1 + targets.max())], axis=0) +imgs = imgs[idxs] +targets = torch.from_numpy(targets[idxs]).long() + +svhn_fewshot_dataset = ImageDataset(imgs, targets, img_transform=test_transform) +svhn_fewshot_dataset.imgs.shape + +# %% [markdown] +# ### Experiments +# +# First, we can apply ProtoNet to the SVHN dataset: + +# %% +protonet_svhn_accuracies = dict() +data_feats = None +for k in [2, 4, 8, 16, 32]: + protonet_svhn_accuracies[k], data_feats = test_proto_net( + protonet_model, svhn_fewshot_dataset, data_feats=data_feats, k_shot=k + ) + print( + "Accuracy for k=%i: %4.2f%% (+-%4.2f%%)" + % (k, 100.0 * protonet_svhn_accuracies[k][0], 100 * protonet_svhn_accuracies[k][1]) + ) + +# %% [markdown] +# It becomes clear that the results are much lower than the ones on CIFAR, and just slightly above random for $k=2$. +# How about ProtoMAML? +# We provide again evaluation files since the evaluation can take several minutes to complete. + +# %% +protomaml_result_file = os.path.join(CHECKPOINT_PATH, "protomaml_svhn_fewshot.json") + +if os.path.isfile(protomaml_result_file): + # Load pre-computed results + with open(protomaml_result_file) as f: + protomaml_svhn_accuracies = json.load(f) + protomaml_svhn_accuracies = {int(k): v for k, v in protomaml_svhn_accuracies.items()} +else: + # Perform same experiments as for ProtoNet + protomaml_svhn_accuracies = dict() + for k in [2, 4, 8, 16, 32]: + protomaml_svhn_accuracies[k] = test_protomaml(protomaml_model, svhn_fewshot_dataset, k_shot=k) + # Export results + with open(protomaml_result_file, "w") as f: + json.dump(protomaml_svhn_accuracies, f, indent=4) + +for k in protomaml_svhn_accuracies: + print( + "Accuracy for k=%i: %4.2f%% (+-%4.2f%%)" + % (k, 100.0 * protomaml_svhn_accuracies[k][0], 100.0 * protomaml_svhn_accuracies[k][1]) + ) + +# %% [markdown] +# While ProtoMAML shows similar performance than ProtoNet for $k\leq 4$, it considerably outperforms ProtoNet for more than 8 shots. +# This is because we can adapt the base model, which is crucial when the data does not fit the original training data. 
+# For $k=32$, ProtoMAML achieves $13\%$ higher classification accuracy than ProtoNet, whose performance already starts to flatten out.
+# We can see the trend more clearly in our plot below.
+
+# %%
+ax = plot_few_shot(protonet_svhn_accuracies, name="ProtoNet", color="C1")
+plot_few_shot(protomaml_svhn_accuracies, name="ProtoMAML", color="C2", ax=ax)
+plt.show()
+plt.close()
+
+# %% [markdown]
+# ## Conclusion
+#
+# In this notebook, we have discussed meta-learning algorithms that learn to adapt to new classes and/or tasks with just a few samples.
+# We have discussed three popular algorithms, namely ProtoNet, MAML and ProtoMAML.
+# On the few-shot image classification task of CIFAR100, ProtoNet and ProtoMAML were shown to perform similarly well, with slight benefits for ProtoMAML at larger shot sizes.
+# However, for out-of-distribution data (SVHN), the ability to optimize the base model proved to be crucial and gave ProtoMAML considerable performance gains over ProtoNet.
+# Nonetheless, ProtoNet offers other advantages compared to ProtoMAML, namely a very cheap training and test cost as well as a simpler implementation.
+# Hence, it is recommended to consider whether the additional complexity of ProtoMAML is worth the extra training and
+# computation cost, or whether ProtoNet is already sufficient for the task at hand.
+
+# %% [markdown]
+# ### References
+#
+# [1] Snell, Jake, Kevin Swersky, and Richard S. Zemel.
+# "Prototypical networks for few-shot learning."
+# NeurIPS 2017.
+# ([link](https://arxiv.org/pdf/1703.05175.pdf))
+#
+# [2] Finn, Chelsea, Pieter Abbeel, and Sergey Levine.
+# "Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks."
+# ICML 2017.
+# ([link](http://proceedings.mlr.press/v70/finn17a.html))
+#
+# [3] Triantafillou, Eleni, Tyler Zhu, Vincent Dumoulin, Pascal Lamblin, Utku Evci, Kelvin Xu, Ross Goroshin et al.
+# "Meta-dataset: A dataset of datasets for learning to learn from few examples."
+# ICLR 2020.
+# ([link](https://openreview.net/pdf?id=rkgAGAVKPr)) diff --git a/course_UvA-DL/12-meta-learning/few-shot-classification.png b/course_UvA-DL/12-meta-learning/few-shot-classification.png new file mode 100644 index 0000000..d0146d1 Binary files /dev/null and b/course_UvA-DL/12-meta-learning/few-shot-classification.png differ diff --git a/course_UvA-DL/12-meta-learning/protonet_classification.svg b/course_UvA-DL/12-meta-learning/protonet_classification.svg new file mode 100644 index 0000000..716fbe3 --- /dev/null +++ b/course_UvA-DL/12-meta-learning/protonet_classification.svg @@ -0,0 +1,612 @@ + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/12-meta-learning/tensorboard_screenshot_ProtoMAML.png b/course_UvA-DL/12-meta-learning/tensorboard_screenshot_ProtoMAML.png new file mode 100644 index 0000000..13f2993 Binary files /dev/null and b/course_UvA-DL/12-meta-learning/tensorboard_screenshot_ProtoMAML.png differ diff --git a/course_UvA-DL/12-meta-learning/tensorboard_screenshot_ProtoNet.png b/course_UvA-DL/12-meta-learning/tensorboard_screenshot_ProtoNet.png new file mode 100644 index 0000000..e420ef8 Binary files /dev/null and b/course_UvA-DL/12-meta-learning/tensorboard_screenshot_ProtoNet.png differ diff --git a/course_UvA-DL/13-contrastive-learning/.meta.yml b/course_UvA-DL/13-contrastive-learning/.meta.yml new file mode 100644 index 0000000..6f9832f --- /dev/null +++ b/course_UvA-DL/13-contrastive-learning/.meta.yml @@ -0,0 +1,26 @@ +title: "Tutorial 13: Self-Supervised Contrastive Learning with SimCLR" +author: Phillip Lippe +created: 2021-08-30 +updated: 2023-03-14 +license: CC BY-SA +tags: + - Image + - Self-Supervised + - Contrastive-Learning +description: | + In this tutorial, we will take a closer look at self-supervised contrastive learning. + Self-supervised learning, or also sometimes called unsupervised learning, describes the scenario where we have given input data, but no accompanying labels to train in a classical supervised way. + However, this data still contains a lot of information from which we can learn: how are the images different from each other? + What patterns are descriptive for certain images? + Can we cluster the images? + To get an insight into these questions, we will implement a popular, simple contrastive learning method, SimCLR, and apply it to the STL10 dataset. + This notebook is part of a lecture series on Deep Learning at the University of Amsterdam. + The full list of tutorials can be found at https://uvadlc-notebooks.rtfd.io. +requirements: + - torchvision + - matplotlib + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/course_UvA-DL/13-contrastive-learning/.thumb.jpg b/course_UvA-DL/13-contrastive-learning/.thumb.jpg new file mode 100644 index 0000000..6e05cce Binary files /dev/null and b/course_UvA-DL/13-contrastive-learning/.thumb.jpg differ diff --git a/course_UvA-DL/13-contrastive-learning/SimCLR.py b/course_UvA-DL/13-contrastive-learning/SimCLR.py new file mode 100644 index 0000000..1bc97bb --- /dev/null +++ b/course_UvA-DL/13-contrastive-learning/SimCLR.py @@ -0,0 +1,841 @@ +# %% [markdown] +#
+# Methods for self-supervised learning try to learn as much as possible from the data alone, so it can quickly be finetuned for a specific classification task. +# The benefit of self-supervised learning is that a large dataset can often easily be obtained. +# For instance, if we want to train a vision model on semantic segmentation for autonomous driving, we can collect large amounts of data by simply installing a camera in a car, and driving through a city for an hour. +# In contrast, if we would want to do supervised learning, we would have to manually label all those images before training a model. +# This is extremely expensive, and would likely take a couple of months to manually label the same amount of data. +# Further, self-supervised learning can provide an alternative to transfer learning from models pretrained on ImageNet since we could pretrain a model on a specific dataset/situation, e.g. traffic scenarios for autonomous driving. +# +# Within the last two years, a lot of new approaches have been proposed for self-supervised learning, in particular for images, that have resulted in great improvements over supervised models when few labels are available. +# The subfield that we will focus on in this tutorial is contrastive learning. +# Contrastive learning is motivated by the question mentioned above: how are images different from each other? +# Specifically, contrastive learning methods train a model to cluster an image and its slightly augmented version in latent space, while the distance to other images should be maximized. +# A very recent and simple method for this is [SimCLR](https://arxiv.org/abs/2006.10029), which is visualized below (figure credit - [Ting Chen et al. ](https://simclr.github.io/)). +# +#
![simclr contrastive learning](simclr_contrastive_learning.png){width="500px"}
+
+# The general setup is that we are given a dataset of images without any labels, and want to train a model on this data such that it can quickly adapt to any image recognition task afterward.
+# During each training iteration, we sample a batch of images as usual.
+# For each image, we create two versions by applying data augmentation techniques like cropping, Gaussian noise, blurring, etc.
+# An example of this is shown on the left with the image of the dog.
+# We will go into the details and effects of the chosen augmentation techniques later.
+# On those images, we apply a CNN like ResNet and obtain as output a 1D feature vector on which we apply a small MLP.
+# The output features of the two augmented images are then trained to be close to each other, while all other images in that batch should be as different as possible.
+# This way, the model has to learn to recognize the content of the image that remains unchanged under the data augmentations, such as objects which we usually care about in supervised tasks.
+#
+# We will now implement this framework ourselves and discuss further details along the way.
+# Let's first start with importing our standard libraries below:
+
+# %%
+import os
+import urllib.request
+from copy import deepcopy
+from urllib.error import HTTPError
+
+import lightning as L
+import matplotlib
+import matplotlib.pyplot as plt
+import matplotlib_inline.backend_inline
+import seaborn as sns
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data as data
+import torchvision
+from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
+from torchvision import transforms
+from torchvision.datasets import STL10
+from tqdm.notebook import tqdm
+
+plt.set_cmap("cividis")
+# %matplotlib inline
+matplotlib_inline.backend_inline.set_matplotlib_formats("svg", "pdf")  # For export
+matplotlib.rcParams["lines.linewidth"] = 2.0
+sns.set()
+
+# Import tensorboard
+# %load_ext tensorboard
+
+# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
+DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
+# Path to the folder where the pretrained models are saved
+CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/ContrastiveLearning/")
+# In this notebook, we use data loaders with heavier computational processing. It is recommended to use as many
+# workers as possible in a data loader, which corresponds to the number of CPU cores
+NUM_WORKERS = os.cpu_count()
+
+# Setting the seed
+L.seed_everything(42)
+
+# Ensure that all operations are deterministic on GPU (if used) for reproducibility
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+
+device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+print("Device:", device)
+print("Number of workers:", NUM_WORKERS)
+
+# %% [markdown]
+# As in many tutorials before, we provide pre-trained models.
+# Note that those models are slightly larger than normal (~100MB overall) since we use the default ResNet-18 architecture.
+# If you are running this notebook locally, make sure to have sufficient disk space available.
+ +# %% +# Github URL where saved models are stored for this tutorial +base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial17/" +# Files to download +pretrained_files = [ + "SimCLR.ckpt", + "ResNet.ckpt", + "tensorboards/SimCLR/events.out.tfevents.SimCLR", + "tensorboards/classification/ResNet/events.out.tfevents.ResNet", +] +pretrained_files += [f"LogisticRegression_{size}.ckpt" for size in [10, 20, 50, 100, 200, 500]] +# Create checkpoint path if it doesn't exist yet +os.makedirs(CHECKPOINT_PATH, exist_ok=True) + +# For each file, check whether it already exists. If not, try downloading it. +for file_name in pretrained_files: + file_path = os.path.join(CHECKPOINT_PATH, file_name) + if "/" in file_name: + os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True) + if not os.path.isfile(file_path): + file_url = base_url + file_name + print(f"Downloading {file_url}...") + try: + urllib.request.urlretrieve(file_url, file_path) + except HTTPError as e: + print( + "Something went wrong. Please try to download the file from the GDrive folder, or contact the author with the full output including the following error:\n", + e, + ) + +# %% [markdown] +# ## SimCLR +# +# We will start our exploration of contrastive learning by discussing the effect of different data augmentation techniques, and how we can implement an efficient data loader for such. +# Next, we implement SimCLR with PyTorch Lightning, and finally train it on a large, unlabeled dataset. + +# %% [markdown] +# ### Data Augmentation for Contrastive Learning +# +# To allow efficient training, we need to prepare the data loading such that we sample two different, random augmentations for each image in the batch. +# The easiest way to do this is by creating a transformation that, when being called, applies a set of data augmentations to an image twice. +# This is implemented in the class `ContrastiveTransformations` below: + + +# %% +class ContrastiveTransformations: + def __init__(self, base_transforms, n_views=2): + self.base_transforms = base_transforms + self.n_views = n_views + + def __call__(self, x): + return [self.base_transforms(x) for i in range(self.n_views)] + + +# %% [markdown] +# The contrastive learning framework can easily be extended to have more _positive_ examples by sampling more than two augmentations of the same image. +# However, the most efficient training is usually obtained by using only two. +# +# Next, we can look at the specific augmentations we want to apply. +# The choice of the data augmentation to use is the most crucial hyperparameter in SimCLR since it directly affects how the latent space is structured, and what patterns might be learned from the data. +# Let's first take a look at some of the most popular data augmentations (figure credit - [Ting Chen and Geoffrey Hinton](https://ai.googleblog.com/2020/04/advancing-self-supervised-and-semi.html)): +# +#
+# +# All of them can be used, but it turns out that two augmentations stand out in their importance: crop-and-resize, and color distortion. +# Interestingly, however, they only lead to strong performance if they have been used together as discussed by [Ting Chen et al. ](https://arxiv.org/abs/2006.10029) in their SimCLR paper. +# When performing randomly cropping and resizing, we can distinguish between two situations: (a) cropped image A provides a local view of cropped image B, or (b) cropped images C and D show neighboring views of the same image (figure credit - [Ting Chen and Geoffrey Hinton](https://ai.googleblog.com/2020/04/advancing-self-supervised-and-semi.html)). +# +#
+# +# While situation (a) requires the model to learn some sort of scale invariance to make crops A and B similar in latent space, situation (b) is more challenging since the model needs to recognize an object beyond its limited view. +# However, without color distortion, there is a loophole that the model can exploit, namely that different crops of the same image usually look very similar in color space. +# Consider the picture of the dog above. +# Simply from the color of the fur and the green color tone of the background, you can reason that two patches belong to the same image without actually recognizing the dog in the picture. +# In this case, the model might end up focusing only on the color histograms of the images, and ignore other more generalizable features. +# If, however, we distort the colors in the two patches randomly and independently of each other, the model cannot rely on this simple feature anymore. +# Hence, by combining random cropping and color distortions, the model can only match two patches by learning generalizable representations. +# +# Overall, for our experiments, we apply a set of 5 transformations following the original SimCLR setup: random horizontal flip, crop-and-resize, color distortion, random grayscale, and gaussian blur. +# In comparison to the [original implementation](https://github.com/google-research/simclr), we reduce the effect of the color jitter slightly (0.5 instead of 0.8 for brightness, contrast, and saturation, and 0.1 instead of 0.2 for hue). +# In our experiments, this setting obtained better performance and was faster and more stable to train. +# If, for instance, the brightness scale highly varies in a dataset, the +# original settings can be more beneficial since the model can't rely on +# this information anymore to distinguish between images. + +# %% +contrast_transforms = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomResizedCrop(size=96), + transforms.RandomApply([transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1)], p=0.8), + transforms.RandomGrayscale(p=0.2), + transforms.GaussianBlur(kernel_size=9), + transforms.ToTensor(), + transforms.Normalize((0.5,), (0.5,)), + ] +) + +# %% [markdown] +# After discussing the data augmentation techniques, we can now focus on the dataset. +# In this tutorial, we will use the [STL10 dataset](https://cs.stanford.edu/~acoates/stl10/), which, similarly to CIFAR10, contains images of 10 classes: airplane, bird, car, cat, deer, dog, horse, monkey, ship, truck. +# However, the images have a higher resolution, namely $96\times 96$ pixels, and we are only provided with 500 labeled images per class. +# Additionally, we have a much larger set of $100,000$ unlabeled images which are similar to the training images but are sampled from a wider range of animals and vehicles. +# This makes the dataset ideal to showcase the benefits that self-supervised learning offers. +# +# Luckily, the STL10 dataset is provided through torchvision. +# Keep in mind, however, that since this dataset is relatively large and has a considerably higher resolution than CIFAR10, it requires more disk space (~3GB) and takes a bit of time to download. +# For our initial discussion of self-supervised learning and SimCLR, we +# will create two data loaders with our contrastive transformations above: +# the `unlabeled_data` will be used to train our model via contrastive +# learning, and `train_data_contrast` will be used as a validation set in +# contrastive learning. 
+ +# %% +unlabeled_data = STL10( + root=DATASET_PATH, + split="unlabeled", + download=True, + transform=ContrastiveTransformations(contrast_transforms, n_views=2), +) +train_data_contrast = STL10( + root=DATASET_PATH, + split="train", + download=True, + transform=ContrastiveTransformations(contrast_transforms, n_views=2), +) + +# %% [markdown] +# Finally, before starting with our implementation of SimCLR, let's look +# at some example image pairs sampled with our augmentations: + +# %% +# Visualize some examples +L.seed_everything(42) +NUM_IMAGES = 6 +imgs = torch.stack([img for idx in range(NUM_IMAGES) for img in unlabeled_data[idx][0]], dim=0) +img_grid = torchvision.utils.make_grid(imgs, nrow=6, normalize=True, pad_value=0.9) +img_grid = img_grid.permute(1, 2, 0) + +plt.figure(figsize=(10, 5)) +plt.title("Augmented image examples of the STL10 dataset") +plt.imshow(img_grid) +plt.axis("off") +plt.show() +plt.close() + +# %% [markdown] +# We see the wide variety of our data augmentation, including randomly cropping, grayscaling, gaussian blur, and color distortion. +# Thus, it remains a challenging task for the model to match two, independently augmented patches of the same image. + +# %% [markdown] +# ### SimCLR implementation +# +# Using the data loader pipeline above, we can now implement SimCLR. +# At each iteration, we get for every image $x$ two differently augmented versions, which we refer to as $\tilde{x}_i$ and $\tilde{x}_j$. +# Both of these images are encoded into a one-dimensional feature vector, between which we want to maximize similarity which minimizes it to all other images in the batch. +# The encoder network is split into two parts: a base encoder network $f(\cdot)$, and a projection head $g(\cdot)$. +# The base network is usually a deep CNN as we have seen in e.g. [Tutorial 5](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial5/Inception_ResNet_DenseNet.html) before, and is responsible for extracting a representation vector from the augmented data examples. +# In our experiments, we will use the common ResNet-18 architecture as $f(\cdot)$, and refer to the output as $f(\tilde{x}_i)=h_i$. +# The projection head $g(\cdot)$ maps the representation $h$ into a space where we apply the contrastive loss, i.e., compare similarities between vectors. +# It is often chosen to be a small MLP with non-linearities, and for simplicity, we follow the original SimCLR paper setup by defining it as a two-layer MLP with ReLU activation in the hidden layer. +# Note that in the follow-up paper, [SimCLRv2](https://arxiv.org/abs/2006.10029), the authors mention that larger/wider MLPs can boost the performance considerably. +# This is why we apply an MLP with four times larger hidden dimensions, but deeper MLPs showed to overfit on the given dataset. +# The general setup is visualized below (figure credit - [Ting Chen et al. ](https://arxiv.org/abs/2006.10029)): +# +#
+# +# After finishing the training with contrastive learning, we will remove the projection head $g(\cdot)$, and use $f(\cdot)$ as a pretrained feature extractor. +# The representations $z$ that come out of the projection head $g(\cdot)$ have been shown to perform worse than those of the base network $f(\cdot)$ when finetuning the network for a new task. +# This is likely because the representations $z$ are trained to become invariant to many features like the color that can be important for downstream tasks. +# Thus, $g(\cdot)$ is only needed for the contrastive learning stage. +# +# Now that the architecture is described, let's take a closer look at how we train the model. +# As mentioned before, we want to maximize the similarity between the representations of the two augmented versions of the same image, i.e., $z_i$ and $z_j$ in the figure above, while minimizing it to all other examples in the batch. +# SimCLR thereby applies the InfoNCE loss, originally proposed by [Aaron van den Oord et al. ](https://arxiv.org/abs/1807.03748) for contrastive learning. +# In short, the InfoNCE loss compares the similarity of $z_i$ and $z_j$ to the similarity of $z_i$ to any other representation in the batch by performing a softmax over the similarity values. +# The loss can be formally written as: +# $$ +# \ell_{i,j}=-\log \frac{\exp(\text{sim}(z_i,z_j)/\tau)}{\sum_{k=1}^{2N}\mathbb{1}_{[k\neq i]}\exp(\text{sim}(z_i,z_k)/\tau)}=-\text{sim}(z_i,z_j)/\tau+\log\left[\sum_{k=1}^{2N}\mathbb{1}_{[k\neq i]}\exp(\text{sim}(z_i,z_k)/\tau)\right] +# $$ +# The function $\text{sim}$ is a similarity metric, and the hyperparameter $\tau$ is called temperature determining how peaked the distribution is. +# Since many similarity metrics are bounded, the temperature parameter allows us to balance the influence of many dissimilar image patches versus one similar patch. +# The similarity metric that is used in SimCLR is cosine similarity, as defined below: +# $$ +# \text{sim}(z_i,z_j) = \frac{z_i^\top \cdot z_j}{||z_i||\cdot||z_j||} +# $$ +# The maximum cosine similarity possible is $1$, while the minimum is $-1$. +# In general, we will see that the features of two different images will converge to a cosine similarity around zero since the minimum, $-1$, would require $z_i$ and $z_j$ to be in the exact opposite direction in all feature dimensions, which does not allow for great flexibility. +# +# Finally, now that we have discussed all details, let's implement SimCLR below as a PyTorch Lightning module: + + +# %% +class SimCLR(L.LightningModule): + def __init__(self, hidden_dim, lr, temperature, weight_decay, max_epochs=500): + super().__init__() + self.save_hyperparameters() + assert self.hparams.temperature > 0.0, "The temperature must be a positive float!" + # Base model f(.) + self.convnet = torchvision.models.resnet18( + pretrained=False, num_classes=4 * hidden_dim + ) # num_classes is the output size of the last linear layer + # The MLP for g(.) 
consists of Linear->ReLU->Linear + self.convnet.fc = nn.Sequential( + self.convnet.fc, # Linear(ResNet output, 4*hidden_dim) + nn.ReLU(inplace=True), + nn.Linear(4 * hidden_dim, hidden_dim), + ) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay) + lr_scheduler = optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=self.hparams.max_epochs, eta_min=self.hparams.lr / 50 + ) + return [optimizer], [lr_scheduler] + + def info_nce_loss(self, batch, mode="train"): + imgs, _ = batch + imgs = torch.cat(imgs, dim=0) + + # Encode all images + feats = self.convnet(imgs) + # Calculate cosine similarity + cos_sim = F.cosine_similarity(feats[:, None, :], feats[None, :, :], dim=-1) + # Mask out cosine similarity to itself + self_mask = torch.eye(cos_sim.shape[0], dtype=torch.bool, device=cos_sim.device) + cos_sim.masked_fill_(self_mask, -9e15) + # Find positive example -> batch_size//2 away from the original example + pos_mask = self_mask.roll(shifts=cos_sim.shape[0] // 2, dims=0) + # InfoNCE loss + cos_sim = cos_sim / self.hparams.temperature + nll = -cos_sim[pos_mask] + torch.logsumexp(cos_sim, dim=-1) + nll = nll.mean() + + # Logging loss + self.log(mode + "_loss", nll) + # Get ranking position of positive example + comb_sim = torch.cat( + [cos_sim[pos_mask][:, None], cos_sim.masked_fill(pos_mask, -9e15)], # First position positive example + dim=-1, + ) + sim_argsort = comb_sim.argsort(dim=-1, descending=True).argmin(dim=-1) + # Logging ranking metrics + self.log(mode + "_acc_top1", (sim_argsort == 0).float().mean()) + self.log(mode + "_acc_top5", (sim_argsort < 5).float().mean()) + self.log(mode + "_acc_mean_pos", 1 + sim_argsort.float().mean()) + + return nll + + def training_step(self, batch, batch_idx): + return self.info_nce_loss(batch, mode="train") + + def validation_step(self, batch, batch_idx): + self.info_nce_loss(batch, mode="val") + + +# %% [markdown] +# Alternatively to performing the validation on the contrastive learning loss as well, we could also take a simple, small downstream task, and track the performance of the base network $f(\cdot)$ on that. +# However, in this tutorial, we will restrict ourselves to the STL10 +# dataset where we use the task of image classification on STL10 as our +# test task. + +# %% [markdown] +# ### Training +# +# Now that we have implemented SimCLR and the data loading pipeline, we are ready to train the model. +# We will use the same training function setup as usual. +# For saving the best model checkpoint, we track the metric `val_acc_top5`, which describes how often the correct image patch is within the top-5 most similar examples in the batch. +# This is usually less noisy than the top-1 metric, making it a better metric to choose the best model from. + + +# %% +def train_simclr(batch_size, max_epochs=500, **kwargs): + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, "SimCLR"), + accelerator="auto", + devices=1, + max_epochs=max_epochs, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc_top5"), + LearningRateMonitor("epoch"), + ], + ) + trainer.logger._default_hp_metric = None # Optional logging argument that we don't need + + # Check whether pretrained model exists. 
If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "SimCLR.ckpt") + if os.path.isfile(pretrained_filename): + print(f"Found pretrained model at {pretrained_filename}, loading...") + # Automatically loads the model with the saved hyperparameters + model = SimCLR.load_from_checkpoint(pretrained_filename) + else: + train_loader = data.DataLoader( + unlabeled_data, + batch_size=batch_size, + shuffle=True, + drop_last=True, + pin_memory=True, + num_workers=NUM_WORKERS, + ) + val_loader = data.DataLoader( + train_data_contrast, + batch_size=batch_size, + shuffle=False, + drop_last=False, + pin_memory=True, + num_workers=NUM_WORKERS, + ) + L.seed_everything(42) # To be reproducable + model = SimCLR(max_epochs=max_epochs, **kwargs) + trainer.fit(model, train_loader, val_loader) + # Load best checkpoint after training + model = SimCLR.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + return model + + +# %% [markdown] +# A common observation in contrastive learning is that the larger the batch size, the better the models perform. +# A larger batch size allows us to compare each image to more negative examples, leading to overall smoother loss gradients. +# However, in our case, we experienced that a batch size of 256 was sufficient to get good results. + +# %% +simclr_model = train_simclr( + batch_size=256, hidden_dim=128, lr=5e-4, temperature=0.07, weight_decay=1e-4, max_epochs=500 +) + +# %% [markdown] +# To get an intuition of how training with contrastive learning behaves, we can take a look at the TensorBoard below: + +# %% +# %tensorboard --logdir ../saved_models/tutorial17/tensorboards/SimCLR/ + +# %% [markdown] +#
![tensorboard simclr](tensorboard_simclr.png){width="1200px"}
+# +# One thing to note is that contrastive learning benefits a lot from long training. +# The shown plot above is from a training that took approx. +# 1 day on a NVIDIA TitanRTX. +# Training the model for even longer might reduce its loss further, but we did not experience any gains from it for the downstream task on image classification. +# In general, contrastive learning can also benefit from using larger models, if sufficient unlabeled data is available. + +# %% [markdown] +# ## Logistic Regression +# +#
+# After we have trained our model via contrastive learning, we can deploy it on downstream tasks and see how well it performs with little data. +# A common setup, which also verifies whether the model has learned generalized representations, is to perform Logistic Regression on the features. +# In other words, we learn a single, linear layer that maps the representations to a class prediction. +# Since the base network $f(\cdot)$ is not changed during the training process, the model can only perform well if the representations of $h$ describe all features that might be necessary for the task. +# Further, we do not have to worry too much about overfitting since we have very few parameters that are trained. +# Hence, we might expect that the model can perform well even with very little data. +# +# First, let's implement a simple Logistic Regression setup for which we assume that the images already have been encoded in their feature vectors. +# If very little data is available, it might be beneficial to dynamically encode the images during training so that we can also apply data augmentations. +# However, the way we implement it here is much more efficient and can be trained within a few seconds. +# Further, using data augmentations did not show any significant gain in this simple setup. + + +# %% +class LogisticRegression(L.LightningModule): + def __init__(self, feature_dim, num_classes, lr, weight_decay, max_epochs=100): + super().__init__() + self.save_hyperparameters() + # Mapping from representation h to classes + self.model = nn.Linear(feature_dim, num_classes) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay) + lr_scheduler = optim.lr_scheduler.MultiStepLR( + optimizer, milestones=[int(self.hparams.max_epochs * 0.6), int(self.hparams.max_epochs * 0.8)], gamma=0.1 + ) + return [optimizer], [lr_scheduler] + + def _calculate_loss(self, batch, mode="train"): + feats, labels = batch + preds = self.model(feats) + loss = F.cross_entropy(preds, labels) + acc = (preds.argmax(dim=-1) == labels).float().mean() + + self.log(mode + "_loss", loss) + self.log(mode + "_acc", acc) + return loss + + def training_step(self, batch, batch_idx): + return self._calculate_loss(batch, mode="train") + + def validation_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="val") + + def test_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="test") + + +# %% [markdown] +# The data we use is the training and test set of STL10. +# The training contains 500 images per class, while the test set has 800 images per class. + +# %% +img_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]) + +train_img_data = STL10(root=DATASET_PATH, split="train", download=True, transform=img_transforms) +test_img_data = STL10(root=DATASET_PATH, split="test", download=True, transform=img_transforms) + +print("Number of training examples:", len(train_img_data)) +print("Number of test examples:", len(test_img_data)) + +# %% [markdown] +# Next, we implement a small function to encode all images in our datasets. +# The output representations are then used as inputs to the Logistic Regression model. + + +# %% +@torch.no_grad() +def prepare_data_features(model, dataset): + # Prepare model + network = deepcopy(model.convnet) + network.fc = nn.Identity() # Removing projection head g(.) 
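    # nn.Identity is a pass-through module, so the network now returns the representation h
    # from the base encoder f(.) rather than the projection z, which (as discussed above)
    # transfers worse to downstream tasks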
+ network.eval() + network.to(device) + + # Encode all images + data_loader = data.DataLoader(dataset, batch_size=64, num_workers=NUM_WORKERS, shuffle=False, drop_last=False) + feats, labels = [], [] + for batch_imgs, batch_labels in tqdm(data_loader): + batch_imgs = batch_imgs.to(device) + batch_feats = network(batch_imgs) + feats.append(batch_feats.detach().cpu()) + labels.append(batch_labels) + + feats = torch.cat(feats, dim=0) + labels = torch.cat(labels, dim=0) + + # Sort images by labels + labels, idxs = labels.sort() + feats = feats[idxs] + + return data.TensorDataset(feats, labels) + + +# %% [markdown] +# Let's apply the function to both training and test set below. + +# %% +train_feats_simclr = prepare_data_features(simclr_model, train_img_data) +test_feats_simclr = prepare_data_features(simclr_model, test_img_data) + +# %% [markdown] +# Finally, we can write a training function as usual. +# We evaluate the model on the test set every 10 epochs to allow early +# stopping, but the low frequency of the validation ensures that we do not +# overfit too much on the test set. + + +# %% +def train_logreg(batch_size, train_feats_data, test_feats_data, model_suffix, max_epochs=100, **kwargs): + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, "LogisticRegression"), + accelerator="auto", + devices=1, + max_epochs=max_epochs, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"), + LearningRateMonitor("epoch"), + ], + enable_progress_bar=False, + check_val_every_n_epoch=10, + ) + trainer.logger._default_hp_metric = None + + # Data loaders + train_loader = data.DataLoader( + train_feats_data, batch_size=batch_size, shuffle=True, drop_last=False, pin_memory=True, num_workers=0 + ) + test_loader = data.DataLoader( + test_feats_data, batch_size=batch_size, shuffle=False, drop_last=False, pin_memory=True, num_workers=0 + ) + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, f"LogisticRegression_{model_suffix}.ckpt") + if os.path.isfile(pretrained_filename): + print(f"Found pretrained model at {pretrained_filename}, loading...") + model = LogisticRegression.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything(42) # To be reproducable + model = LogisticRegression(**kwargs) + trainer.fit(model, train_loader, test_loader) + model = LogisticRegression.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + # Test best model on train and validation set + train_result = trainer.test(model, dataloaders=train_loader, verbose=False) + test_result = trainer.test(model, dataloaders=test_loader, verbose=False) + result = {"train": train_result[0]["test_acc"], "test": test_result[0]["test_acc"]} + + return model, result + + +# %% [markdown] +# Despite the training dataset of STL10 already only having 500 labeled images per class, we will perform experiments with even smaller datasets. +# Specifically, we train a Logistic Regression model for datasets with only 10, 20, 50, 100, 200, and all 500 examples per class. +# This gives us an intuition on how well the representations learned by contrastive learning can be transfered to a image recognition task like this classification. 
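#
# Note that `prepare_data_features` sorted the examples by label, so each of the 10 classes occupies a
# contiguous block of 500 feature vectors.
# This is what lets us subsample the first examples of every class with a simple `unflatten`/`flatten`
# indexing trick in the helper below.
# As a rough illustration on a toy tensor (hypothetical shapes: 3 classes with 4 sorted examples each,
# keeping 2 per class):

# %%
# Toy illustration of the label-block subsampling trick (not part of the original experiments)
toy_labels = torch.arange(3).repeat_interleave(4)  # tensor([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
print(toy_labels.unflatten(0, (3, 4))[:, :2].flatten(0, 1))  # tensor([0, 0, 1, 1, 2, 2])

# %% [markdown]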
+# First, let's define a function to create the intended sub-datasets from the full training set: + + +# %% +def get_smaller_dataset(original_dataset, num_imgs_per_label): + new_dataset = data.TensorDataset( + *(t.unflatten(0, (10, 500))[:, :num_imgs_per_label].flatten(0, 1) for t in original_dataset.tensors) + ) + return new_dataset + + +# %% [markdown] +# Next, let's run all models. +# Despite us training 6 models, this cell could be run within a minute or two without the pretrained models. + +# %% +results = {} +for num_imgs_per_label in [10, 20, 50, 100, 200, 500]: + sub_train_set = get_smaller_dataset(train_feats_simclr, num_imgs_per_label) + _, small_set_results = train_logreg( + batch_size=64, + train_feats_data=sub_train_set, + test_feats_data=test_feats_simclr, + model_suffix=num_imgs_per_label, + feature_dim=train_feats_simclr.tensors[0].shape[1], + num_classes=10, + lr=1e-3, + weight_decay=1e-3, + ) + results[num_imgs_per_label] = small_set_results + +# %% [markdown] +# Finally, let's plot the results. + +# %% +dataset_sizes = sorted(k for k in results) +test_scores = [results[k]["test"] for k in dataset_sizes] + +fig = plt.figure(figsize=(6, 4)) +plt.plot( + dataset_sizes, + test_scores, + "--", + color="#000", + marker="*", + markeredgecolor="#000", + markerfacecolor="y", + markersize=16, +) +plt.xscale("log") +plt.xticks(dataset_sizes, labels=dataset_sizes) +plt.title("STL10 classification over dataset size", fontsize=14) +plt.xlabel("Number of images per class") +plt.ylabel("Test accuracy") +plt.minorticks_off() +plt.show() + +for k, score in zip(dataset_sizes, test_scores): + print(f"Test accuracy for {k:3d} images per label: {100*score:4.2f}%") + +# %% [markdown] +# As one would expect, the classification performance improves the more data we have. +# However, with only 10 images per class, we can already classify more than 60% of the images correctly. +# This is quite impressive, considering that the images are also higher dimensional than e.g. CIFAR10. +# With the full dataset, we achieve an accuracy of 81%. +# The increase between 50 to 500 images per class might suggest a linear increase in performance with an exponentially larger dataset. +# However, with even more data, we could also finetune $f(\cdot)$ in the training process, allowing for the representations to adapt more to the specific classification task given. +# +# To set the results above into perspective, we will train the base +# network, a ResNet-18, on the classification task from scratch. + +# %% [markdown] +# ## Baseline +# +# As a baseline to our results above, we will train a standard ResNet-18 with random initialization on the labeled training set of STL10. +# The results will give us an indication of the advantages that contrastive learning on unlabeled data has compared to using only supervised training. +# The implementation of the model is straightforward since the ResNet +# architecture is provided in the torchvision library. 
+ + +# %% +class ResNet(L.LightningModule): + def __init__(self, num_classes, lr, weight_decay, max_epochs=100): + super().__init__() + self.save_hyperparameters() + self.model = torchvision.models.resnet18(pretrained=False, num_classes=num_classes) + + def configure_optimizers(self): + optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay) + lr_scheduler = optim.lr_scheduler.MultiStepLR( + optimizer, milestones=[int(self.hparams.max_epochs * 0.7), int(self.hparams.max_epochs * 0.9)], gamma=0.1 + ) + return [optimizer], [lr_scheduler] + + def _calculate_loss(self, batch, mode="train"): + imgs, labels = batch + preds = self.model(imgs) + loss = F.cross_entropy(preds, labels) + acc = (preds.argmax(dim=-1) == labels).float().mean() + + self.log(mode + "_loss", loss) + self.log(mode + "_acc", acc) + return loss + + def training_step(self, batch, batch_idx): + return self._calculate_loss(batch, mode="train") + + def validation_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="val") + + def test_step(self, batch, batch_idx): + self._calculate_loss(batch, mode="test") + + +# %% [markdown] +# It is clear that the ResNet easily overfits on the training data since its parameter count is more than 1000 times larger than the dataset size. +# To make the comparison to the contrastive learning models fair, we apply data augmentations similar to the ones we used before: horizontal flip, crop-and-resize, grayscale, and gaussian blur. +# Color distortions as before are not used because the color distribution of an image showed to be an important feature for the classification. +# Hence, we observed no noticeable performance gains when adding color distortions to the set of augmentations. +# Similarly, we restrict the resizing operation before cropping to the max. +# 125% of its original resolution, instead of 1250% as done in SimCLR. +# This is because, for classification, the model needs to recognize the full object, while in contrastive learning, we only want to check whether two patches belong to the same image/object. +# Hence, the chosen augmentations below are overall weaker than in the contrastive learning case. + +# %% +train_transforms = transforms.Compose( + [ + transforms.RandomHorizontalFlip(), + transforms.RandomResizedCrop(size=96, scale=(0.8, 1.0)), + transforms.RandomGrayscale(p=0.2), + transforms.GaussianBlur(kernel_size=9, sigma=(0.1, 0.5)), + transforms.ToTensor(), + transforms.Normalize((0.5,), (0.5,)), + ] +) + +train_img_aug_data = STL10(root=DATASET_PATH, split="train", download=True, transform=train_transforms) + +# %% [markdown] +# The training function for the ResNet is almost identical to the Logistic Regression setup. +# Note that we allow the ResNet to perform validation every 2 epochs to +# also check whether the model overfits strongly in the first iterations +# or not. 
+ + +# %% +def train_resnet(batch_size, max_epochs=100, **kwargs): + trainer = L.Trainer( + default_root_dir=os.path.join(CHECKPOINT_PATH, "ResNet"), + accelerator="auto", + devices=1, + max_epochs=max_epochs, + callbacks=[ + ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"), + LearningRateMonitor("epoch"), + ], + check_val_every_n_epoch=2, + ) + trainer.logger._default_hp_metric = None + + # Data loaders + train_loader = data.DataLoader( + train_img_aug_data, + batch_size=batch_size, + shuffle=True, + drop_last=True, + pin_memory=True, + num_workers=NUM_WORKERS, + ) + test_loader = data.DataLoader( + test_img_data, batch_size=batch_size, shuffle=False, drop_last=False, pin_memory=True, num_workers=NUM_WORKERS + ) + + # Check whether pretrained model exists. If yes, load it and skip training + pretrained_filename = os.path.join(CHECKPOINT_PATH, "ResNet.ckpt") + if os.path.isfile(pretrained_filename): + print("Found pretrained model at %s, loading..." % pretrained_filename) + model = ResNet.load_from_checkpoint(pretrained_filename) + else: + L.seed_everything(42) # To be reproducable + model = ResNet(**kwargs) + trainer.fit(model, train_loader, test_loader) + model = ResNet.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + + # Test best model on validation set + train_result = trainer.test(model, dataloaders=train_loader, verbose=False) + val_result = trainer.test(model, dataloaders=test_loader, verbose=False) + result = {"train": train_result[0]["test_acc"], "test": val_result[0]["test_acc"]} + + return model, result + + +# %% [markdown] +# Finally, let's train the model and check its results: + +# %% +resnet_model, resnet_result = train_resnet(batch_size=64, num_classes=10, lr=1e-3, weight_decay=2e-4, max_epochs=100) +print(f"Accuracy on training set: {100*resnet_result['train']:4.2f}%") +print(f"Accuracy on test set: {100*resnet_result['test']:4.2f}%") + +# %% [markdown] +# The ResNet trained from scratch achieves 73.31% on the test set. +# This is almost 8% less than the contrastive learning model, and even slightly less than SimCLR achieves with 1/10 of the data. +# This shows that self-supervised, contrastive learning provides +# considerable performance gains by leveraging large amounts of unlabeled +# data when little labeled data is available. + +# %% [markdown] +# ## Conclusion +# +# In this tutorial, we have discussed self-supervised contrastive learning and implemented SimCLR as an example method. +# We have applied it to the STL10 dataset and showed that it can learn generalizable representations that we can use to train simple classification models. +# With 500 images per label, it achieved an 8% higher accuracy than a similar model solely trained from supervision and performs on par with it when only using a tenth of the labeled data. +# Our experimental results are limited to a single dataset, but recent works such as [Ting Chen et al. ](https://arxiv.org/abs/2006.10029) showed similar trends for larger datasets like ImageNet. +# Besides the discussed hyperparameters, the size of the model seems to be important in contrastive learning as well. +# If a lot of unlabeled data is available, larger models can achieve much stronger results and come close to their supervised baselines. +# Further, there are also approaches for combining contrastive and supervised learning, leading to performance gains beyond supervision (see [Khosla et al.](https://arxiv.org/abs/2004.11362)). 
+# Moreover, contrastive learning is not the only approach to self-supervised learning that has come up in the last two years and showed great results. +# Other methods include distillation-based methods like [BYOL](https://arxiv.org/abs/2006.07733) and redundancy reduction techniques like [Barlow Twins](https://arxiv.org/abs/2103.03230). +# There is a lot more to explore in the self-supervised domain, and more, impressive steps ahead are to be expected. +# +# ### References +# +# [1] Chen, T., Kornblith, S., Norouzi, M., and Hinton, G. (2020). +# A simple framework for contrastive learning of visual representations. +# In International conference on machine learning (pp. +# 1597-1607). +# PMLR. +# ([link](https://arxiv.org/abs/2002.05709)) +# +# [2] Chen, T., Kornblith, S., Swersky, K., Norouzi, M., and Hinton, G. (2020). +# Big self-supervised models are strong semi-supervised learners. +# NeurIPS 2021 ([link](https://arxiv.org/abs/2006.10029)). +# +# [3] Oord, A. V. D., Li, Y., and Vinyals, O. +# (2018). +# Representation learning with contrastive predictive coding. +# arXiv preprint arXiv:1807.03748. +# ([link](https://arxiv.org/abs/1807.03748)) +# +# [4] Grill, J.B., Strub, F., Altché, F., Tallec, C., Richemond, P.H., Buchatskaya, E., Doersch, C., Pires, B.A., Guo, Z.D., Azar, M.G. +# and Piot, B. +# (2020). +# Bootstrap your own latent: A new approach to self-supervised learning. +# arXiv preprint arXiv:2006.07733. +# ([link](https://arxiv.org/abs/2006.07733)) +# +# [5] Khosla, P., Teterwak, P., Wang, C., Sarna, A., Tian, Y., Isola, P., Maschinot, A., Liu, C. and Krishnan, D. (2020). +# Supervised contrastive learning. +# arXiv preprint arXiv:2004.11362. +# ([link](https://arxiv.org/abs/2004.11362)) +# +# [6] Zbontar, J., Jing, L., Misra, I., LeCun, Y. and Deny, S. (2021). +# Barlow twins: Self-supervised learning via redundancy reduction. +# arXiv preprint arXiv:2103.03230. 
+# ([link](https://arxiv.org/abs/2103.03230)) diff --git a/course_UvA-DL/13-contrastive-learning/crop_views.svg b/course_UvA-DL/13-contrastive-learning/crop_views.svg new file mode 100644 index 0000000..6ea2ba9 --- /dev/null +++ b/course_UvA-DL/13-contrastive-learning/crop_views.svg @@ -0,0 +1 @@ + diff --git a/course_UvA-DL/13-contrastive-learning/simclr_contrastive_learning.png b/course_UvA-DL/13-contrastive-learning/simclr_contrastive_learning.png new file mode 100644 index 0000000..622b411 Binary files /dev/null and b/course_UvA-DL/13-contrastive-learning/simclr_contrastive_learning.png differ diff --git a/course_UvA-DL/13-contrastive-learning/simclr_data_augmentations.jpg b/course_UvA-DL/13-contrastive-learning/simclr_data_augmentations.jpg new file mode 100644 index 0000000..57a440c Binary files /dev/null and b/course_UvA-DL/13-contrastive-learning/simclr_data_augmentations.jpg differ diff --git a/course_UvA-DL/13-contrastive-learning/simclr_network_setup.svg b/course_UvA-DL/13-contrastive-learning/simclr_network_setup.svg new file mode 100644 index 0000000..687a62d --- /dev/null +++ b/course_UvA-DL/13-contrastive-learning/simclr_network_setup.svg @@ -0,0 +1,228 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/course_UvA-DL/13-contrastive-learning/tensorboard_simclr.png b/course_UvA-DL/13-contrastive-learning/tensorboard_simclr.png new file mode 100644 index 0000000..bfe6859 Binary files /dev/null and b/course_UvA-DL/13-contrastive-learning/tensorboard_simclr.png differ diff --git a/flash_tutorials/electricity_forecasting/.meta.yml b/flash_tutorials/electricity_forecasting/.meta.yml new file mode 100644 index 0000000..ab165e1 --- /dev/null +++ b/flash_tutorials/electricity_forecasting/.meta.yml @@ -0,0 +1,24 @@ +title: Electricity Price Forecasting with N-BEATS +author: Ethan Harris (ethan@pytorchlightning.ai) +created: 2021-11-23 +updated: 2021-12-16 +license: CC BY-SA +build: 3 +tags: + - Tabular + - Forecasting + - Timeseries +description: | + This tutorial covers using Lightning Flash and it's integration with PyTorch Forecasting to train an autoregressive + model (N-BEATS) on hourly electricity pricing data. We show how the built-in interpretability tools from PyTorch + Forecasting can be used with Flash to plot the trend and daily seasonality in our data discovered by the model. We + also cover how features from PyTorch Lightning such as the learning rate finder can be used easily with Flash. As a + bonus, we show hat we can resample daily observations from the data to discover weekly trends instead. 
+requirements: + - pandas==1.1.5 + - lightning-flash[tabular]>=0.6.0 + - pytorch-lightning==1.3.6 # todo: update to latest + - numpy<1.24 +accelerator: + - GPU + - CPU diff --git a/flash_tutorials/electricity_forecasting/.thumb.svg b/flash_tutorials/electricity_forecasting/.thumb.svg new file mode 100644 index 0000000..3f8037e --- /dev/null +++ b/flash_tutorials/electricity_forecasting/.thumb.svg @@ -0,0 +1 @@ + diff --git a/flash_tutorials/electricity_forecasting/diagram.png b/flash_tutorials/electricity_forecasting/diagram.png new file mode 100644 index 0000000..47120db Binary files /dev/null and b/flash_tutorials/electricity_forecasting/diagram.png differ diff --git a/flash_tutorials/electricity_forecasting/electricity_forecasting.py b/flash_tutorials/electricity_forecasting/electricity_forecasting.py new file mode 100644 index 0000000..621cd30 --- /dev/null +++ b/flash_tutorials/electricity_forecasting/electricity_forecasting.py @@ -0,0 +1,311 @@ +# %% [markdown] +# In this tutorial we'll look at using [Lightning Flash](https://github.com/Lightning-AI/lightning-flash) and it's +# integration with [PyTorch Forecasting](https://github.com/jdb78/pytorch-forecasting) for autoregressive modelling of +# electricity prices using [the N-BEATS model](https://arxiv.org/abs/1905.10437). +# We'll start by using N-BEATS to uncover daily patterns (seasonality) from hourly observations and then show how we can +# resample daily averages to uncover weekly patterns too. +# +# Along the way, we'll see how the built-in tools from PyTorch Lightning, like the learning rate finder, can be used +# seamlessly with Flash to help make the process of putting a model together as smooth as possible. + +# %% + +import os +from typing import Any, Dict + +import flash +import matplotlib.pyplot as plt +import pandas as pd +import torch +from flash.core.data.utils import download_data +from flash.core.integrations.pytorch_forecasting import convert_predictions +from flash.tabular.forecasting import TabularForecaster, TabularForecastingData + +DATASET_PATH = os.environ.get("PATH_DATASETS", "data/") + +# %% [markdown] +# ## Loading the data +# +# We'll use the Spanish hourly energy demand generation and weather data set from Kaggle: +# https://www.kaggle.com/nicholasjhana/energy-consumption-generation-prices-and-weather +# +# First, download the data: + +# %% +download_data("https://pl-flash-data.s3.amazonaws.com/kaggle_electricity.zip", DATASET_PATH) + +# %% [markdown] +# ## Data loading +# +# To load the data, we start by loading the CSV file into a pandas DataFrame: + +# %% +df_energy_hourly = pd.read_csv(f"{DATASET_PATH}/energy_dataset.csv", parse_dates=["time"]) + +# %% [markdown] +# Before we can load the data into Flash, there are a few preprocessing steps we need to take. +# The first preprocessing step is to set the `time` field as the index (formatted as a datetime). +# The second step is to resample the data to the desired frequency in case it is different from the desired observation +# frequency. +# Since we are performing autoregressive modelling, we can remove all columns except for `"price actual"`. +# +# For the third preprocessing step, we need to create a "time_idx" column. +# The "time_idx" column should contain integers corresponding to the observation index (e.g. in our case the difference +# between two "time_idx" values is the number of hours between the observations). 
+# To do this we convert the datetime to an index by taking the nanoseconds value and dividing by the number of +# nanoseconds in a single unit of our chosen frequency. +# We then subtract the minimum value so it starts at zero (although it would still work without this step). +# +# The Flash `TabularForecastingData` (which uses the `TimeSeriesDataSet` from PyTorch Forecasting internally) also +# supports loading data from multiple time series (e.g. you may have electricity data from multiple countries). +# To indicate that our data is all from the same series, we add a `constant` column with a constant value of zero. +# +# Here's the full preprocessing function: + +# %% + + +def preprocess(df: pd.DataFrame, frequency: str = "1H") -> pd.DataFrame: + df["time"] = pd.to_datetime(df["time"], utc=True, infer_datetime_format=True) + df.set_index("time", inplace=True) + + df = df.resample(frequency).mean() + + df = df.filter(["price actual"]) + + df["time_idx"] = (df.index.view(int) / pd.Timedelta(frequency).value).astype(int) + df["time_idx"] -= df["time_idx"].min() + + df["constant"] = 0 + + return df + + +df_energy_hourly = preprocess(df_energy_hourly) + +# %% [markdown] +# ## Creating the Flash DataModule +# +# Now, we can create a `TabularForecastingData`. +# The role of the `TabularForecastingData` is to split up our time series into windows which include a region to encode +# (of size `max_encoder_length`) and a region to predict (of size `max_prediction_length`) which will be used to compute +# the loss. +# The size of the prediction window should be chosen depending on the kinds of trends we would like our model to +# uncover. +# In our case, we are interested in how electricity prices change throughout the day, so a one day prediction window +# (`max_prediction_length = 24`) makes sense here. +# The size of the encoding window can vary, however, in the [N-BEATS paper](https://arxiv.org/abs/1905.10437) the +# authors suggest using an encoder length of between two and ten times the prediction length. +# We therefore choose two days (`max_encoder_length = 48`) as the encoder length. + +# %% +max_prediction_length = 24 +max_encoder_length = 24 * 2 + +training_cutoff = df_energy_hourly["time_idx"].max() - max_prediction_length + +datamodule = TabularForecastingData.from_data_frame( + time_idx="time_idx", + target="price actual", + group_ids=["constant"], + max_encoder_length=max_encoder_length, + max_prediction_length=max_prediction_length, + time_varying_unknown_reals=["price actual"], + train_data_frame=df_energy_hourly[df_energy_hourly["time_idx"] <= training_cutoff], + val_data_frame=df_energy_hourly, + batch_size=256, +) + +# %% [markdown] +# ## Creating the Flash Task +# +# Now, we're ready to create a `TabularForecaster`. +# The N-BEATS model has two primary hyper-parameters:`"widths"`, and `"backcast_loss_ratio"`. +# In the [PyTorch Forecasting Documentation](https://pytorch-forecasting.readthedocs.io/en/latest/api/pytorch_forecasting.models.nbeats.NBeats.html), +# the authors recommend using `"widths"` of `[32, 512]`. +# In order to prevent overfitting with smaller datasets, a good rule of thumb is to limit the number of parameters of +# your model. +# For this reason, we use `"widths"` of `[16, 256]`. 
+# +# To understand the `"backcast_loss_ratio"`, let's take a look at this diagram of the model taken from +# [the arXiv paper](https://arxiv.org/abs/1905.10437): +# +# ![N-BEATS diagram](diagram.png) +# +# Each 'block' within the N-BEATS architecture includes a forecast output and a backcast which can each yield their own +# loss. +# The `"backcast_loss_ratio"` is the ratio of the backcast loss to the forecast loss. +# A value of `1.0` means that the loss function is simply the sum of the forecast and backcast losses. + +# %% +model = TabularForecaster( + datamodule.parameters, backbone="n_beats", backbone_kwargs={"widths": [16, 256], "backcast_loss_ratio": 1.0} +) + +# %% [markdown] +# ## Finding the learning rate +# +# Tabular models can be particularly sensitive to the choice of learning rate. +# Helpfully, PyTorch Lightning provides a built-in learning rate finder that suggests a suitable learning rate +# automatically. +# To use it, we first create our Trainer. +# We apply gradient clipping (a common technique for tabular tasks) with ``gradient_clip_val=0.01`` in order to help +# prevent our model from over-fitting. +# Here's how to find the learning rate: + +# %% +trainer = flash.Trainer( + max_epochs=3, + gpus=int(torch.cuda.is_available()), + gradient_clip_val=0.01, +) + +res = trainer.tuner.lr_find(model, datamodule=datamodule, min_lr=1e-5) +print(f"Suggested learning rate: {res.suggestion()}") +res.plot(show=True, suggest=True).show() + +# %% [markdown] +# Once the suggest learning rate has been found, we can update our model with it: + +# %% +model.learning_rate = res.suggestion() + +# %% [markdown] +# ## Training the model +# Now all we have to do is train the model! + +# %% +trainer.fit(model, datamodule=datamodule) + +# %% [markdown] +# ## Plot the interpretation +# +# An important feature of the N-BEATS model is that it can be configured to produce an interpretable prediction that is +# split into both a low frequency (trend) component and a high frequency (seasonality) component. +# For hourly observations, we might expect the trend component to show us how electricity prices are changing from one +# day to the next (for example, whether prices were generally higher or lower than yesterday). +# In contrast, the seasonality component would be expected to show us the general pattern in prices through the day +# (for example, if there is typically a peak in price around lunch time or a drop at night). +# +# It is often useful to visualize this decomposition and the `TabularForecaster` makes it simple. +# First, we load the best model from our training run and generate some predictions. +# Next, we convert the predictions to the format expected by PyTorch Forecasting using the `convert_predictions` utility +# function. +# Finally, we plot the interpretation using the `pytorch_forecasting_model` attribute. 
+# Here's the full function: + +# %% + + +def plot_interpretation(model_path: str, predict_df: pd.DataFrame, parameters: Dict[str, Any]): + model = TabularForecaster.load_from_checkpoint(model_path) + datamodule = TabularForecastingData.from_data_frame( + parameters=parameters, + predict_data_frame=predict_df, + batch_size=256, + ) + trainer = flash.Trainer(gpus=int(torch.cuda.is_available())) + predictions = trainer.predict(model, datamodule=datamodule) + predictions, inputs = convert_predictions(predictions) + model.pytorch_forecasting_model.plot_interpretation(inputs, predictions, idx=0) + plt.show() + + +# %% [markdown] +# And now we run the function to plot the trend and seasonality curves: + +# %% +# Todo: Make sure to uncomment the line below if you want to run predictions and visualize the graph +# plot_interpretation(trainer.checkpoint_callback.best_model_path, df_energy_hourly, datamodule.parameters) + +# %% [markdown] +# It worked! The plot shows that the `TabularForecaster` does a reasonable job of modelling the time series and also +# breaks it down into a trend component and a seasonality component (in this case showing daily fluctuations in +# electricity prices). +# +# ## Bonus: Weekly trends +# +# The type of seasonality that the model learns to detect is dictated by the frequency of observations and the length of +# the encoding / prediction window. +# We might imagine that our pipeline could be changed to instead uncover weekly trends if we resample daily +# observations from our data instead of hourly. +# +# We can use our preprocessing function to do this. +# First, we load the data as before then preprocess it (this time setting `frequency = "1D"`). + +# %% +df_energy_daily = pd.read_csv(f"{DATASET_PATH}/energy_dataset.csv", parse_dates=["time"]) +df_energy_daily = preprocess(df_energy_daily, frequency="1D") + +# %% [markdown] +# Now let's create our `TabularForecastingData` as before, this time with a four week encoding window and a one week +# prediction window. + +# %% +max_prediction_length = 1 * 7 +max_encoder_length = 4 * 7 + +training_cutoff = df_energy_daily["time_idx"].max() - max_prediction_length + +datamodule = TabularForecastingData.from_data_frame( + time_idx="time_idx", + target="price actual", + group_ids=["constant"], + max_encoder_length=max_encoder_length, + max_prediction_length=max_prediction_length, + time_varying_unknown_reals=["price actual"], + train_data_frame=df_energy_daily[df_energy_daily["time_idx"] <= training_cutoff], + val_data_frame=df_energy_daily, + batch_size=256, +) + +# %% [markdown] +# Now it's time to create a new model and trainer. +# We run for 24 times the number of epochs this time as we now have around 1/24th of the number of observations. 
+# This time, instead of using the learning rate finder we just set the learning rate manually: + +# %% +model = TabularForecaster( + datamodule.parameters, + backbone="n_beats", + backbone_kwargs={"widths": [16, 256], "backcast_loss_ratio": 1.0}, + learning_rate=5e-4, +) + +trainer = flash.Trainer( + max_epochs=3 * 24, + check_val_every_n_epoch=24, + gpus=int(torch.cuda.is_available()), + gradient_clip_val=0.01, +) + +# %% [markdown] +# Finally, we train the new model: + +# %% +trainer.fit(model, datamodule=datamodule) + +# %% [markdown] +# Now let's look at what it learned: + +# %% +# Todo: Make sure to uncomment the line below if you want to run predictions and visualize the graph +# plot_interpretation(trainer.checkpoint_callback.best_model_path, df_energy_daily, datamodule.parameters) + +# %% [markdown] +# Success! We can now also see weekly trends / seasonality uncovered by our new model. +# +# ## Closing thoughts and next steps! +# +# This tutorial has shown how Flash and PyTorch Forecasting can be used to train state-of-the-art auto-regressive +# forecasting models (such as N-BEATS). +# We've seen how we can influence the kinds of trends and patterns uncovered by the model by resampling the data and +# changing the hyper-parameters. +# +# There are plenty of ways you could take this tutorial further. +# For example, you could try a more complex model, such as the +# [temporal fusion transformer](https://pytorch-forecasting.readthedocs.io/en/latest/api/pytorch_forecasting.models.temporal_fusion_transformer.TemporalFusionTransformer.html), +# which can handle additional inputs (the kaggle data set we used also includes weather data). +# +# Alternatively, if you want to be a bit more adventurous, you could look at +# [some of the other problems that can solved with Lightning Flash](https://lightning-flash.readthedocs.io/en/stable/?badge=stable). diff --git a/flash_tutorials/image_classification/.meta.yml b/flash_tutorials/image_classification/.meta.yml new file mode 100644 index 0000000..e4f1cfc --- /dev/null +++ b/flash_tutorials/image_classification/.meta.yml @@ -0,0 +1,19 @@ +title: Image Classification on Hymenoptera Dataset +author: Ethan Harris (ethan@pytorchlightning.ai) +created: 2021-11-23 +updated: 2022-08-26 +license: CC BY-SA +build: 3 +tags: + - Image Classification + - Image +description: | + In this tutorial, we'll go over the basics of lightning Flash by finetuning/predictin with an ImageClassifier on [Hymenoptera Dataset](https://www.kaggle.com/ajayrana/hymenoptera-data) containing ants and bees images. +requirements: + - pytorch-lightning==1.6.* + - lightning-flash[image]>=0.7.0 + - torchmetrics<0.11 # todo: task argument is missing + - numpy<1.24 +accelerator: + - GPU + - CPU diff --git a/flash_tutorials/image_classification/image_classification.py b/flash_tutorials/image_classification/image_classification.py new file mode 100644 index 0000000..ba34a8b --- /dev/null +++ b/flash_tutorials/image_classification/image_classification.py @@ -0,0 +1,115 @@ +# %% [markdown] +# In this tutorial, we'll go over the basics of lightning Flash by finetuning/predictin with an ImageClassifier on [Hymenoptera Dataset](https://www.kaggle.com/ajayrana/hymenoptera-data) containing ants and bees images. +# +# # Finetuning +# +# Finetuning consists of four steps: +# +# - 1. Train a source neural network model on a source dataset. For computer vision, it is traditionally the [ImageNet dataset](http://www.image-net.org). 
As training is costly, libraries such as [Torchvision](https://pytorch.org/vision/stable/index.html) provide popular pre-trained model architectures. In this notebook, we will be using their [resnet-18](https://pytorch.org/hub/pytorch_vision_resnet/).
#
# - 2. Create a new neural network called the target model. It replicates the source model's architecture and parameters, except for the last layer, which is removed. The model without its last layer is traditionally called a backbone.
#
# - 3. Add new layers after the backbone so that the final output size is the number of categories in the target dataset. These new layers, traditionally called the head, are randomly initialized, while the backbone keeps its pre-trained weights from ImageNet.
#
# - 4. Train the target model on a target dataset, such as the Hymenoptera Dataset with ants and bees. However, freezing some layers (such as the backbone) at the start of training tends to be more stable. In Flash, this can easily be done with `trainer.finetune(..., strategy="freeze")`. It is also common to `freeze/unfreeze` the backbone, which in `Flash` can be done with `trainer.finetune(..., strategy="freeze_unfreeze")`. If one wants more control over the unfreezing flow, Flash supports `trainer.finetune(..., strategy=MyFinetuningStrategy())`, where `MyFinetuningStrategy` subclasses `pytorch_lightning.callbacks.BaseFinetuning` (see the short sketch shown just before we create the trainer below).

# %%

import flash
from flash.core.data.utils import download_data
from flash.image import ImageClassificationData, ImageClassifier

# %% [markdown]
# ## Download data
# The data are downloaded from a URL and saved in a 'data' directory.

# %%
download_data("https://pl-flash-data.s3.amazonaws.com/hymenoptera_data.zip", "data/")


# %% [markdown]
# ## Load the data
#
# Flash Tasks have built-in DataModules that you can use to organize your data. Pass in train, validation and test folders and Flash will take care of the rest.
# This creates an ImageClassificationData object from folders of images arranged in this way:
#
# train/dog/xxx.png
# train/dog/xxy.png
# train/dog/xxz.png
# train/cat/123.png
# train/cat/nsdf3.png
# train/cat/asd932.png

# %%
datamodule = ImageClassificationData.from_folders(
    train_folder="data/hymenoptera_data/train/",
    val_folder="data/hymenoptera_data/val/",
    test_folder="data/hymenoptera_data/test/",
    batch_size=1,
)


# %% [markdown]
# ## Build the model
# Create the ImageClassifier task. By default, the ImageClassifier task uses a [resnet-18](https://pytorch.org/hub/pytorch_vision_resnet/) backbone to train or finetune your model.
# For the [Hymenoptera Dataset](https://www.kaggle.com/ajayrana/hymenoptera-data) containing ants and bees images, ``datamodule.num_classes`` will be 2.
# The backbone can easily be changed with `ImageClassifier(backbone="resnet50")`, or you can provide your own with `ImageClassifier(backbone=my_backbone)`.

# %%
model = ImageClassifier(num_classes=datamodule.num_classes)


# %% [markdown]
# ## Create the trainer. Run once on data
# The trainer object can be used for training or fine-tuning tasks on new sets of data.
# You can pass in parameters to control the training routine - limit the number of epochs, run on GPUs or TPUs, etc.
# For more details, read the [Trainer Documentation](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=Trainer).
# In this demo, we will limit the fine-tuning to a single epoch by setting `max_epochs=1`.
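#
# As an aside, here is a rough idea of what the custom `MyFinetuningStrategy` mentioned in step 4 above could
# look like. This is only a sketch and is not used in this tutorial: it assumes the task exposes its backbone
# as `pl_module.backbone`, and the exact hook signatures of `BaseFinetuning` can differ between PyTorch
# Lightning versions.
#
# ```python
# from pytorch_lightning.callbacks import BaseFinetuning
#
#
# class MyFinetuningStrategy(BaseFinetuning):
#     def __init__(self, unfreeze_at_epoch: int = 5):
#         super().__init__()
#         self._unfreeze_at_epoch = unfreeze_at_epoch
#
#     def freeze_before_training(self, pl_module):
#         # Keep the pre-trained backbone frozen when training starts
#         self.freeze(pl_module.backbone)
#
#     def finetune_function(self, pl_module, current_epoch, optimizer, opt_idx):
#         # After a few epochs, unfreeze the backbone and train it together with the head
#         if current_epoch == self._unfreeze_at_epoch:
#             self.unfreeze_and_add_param_group(modules=pl_module.backbone, optimizer=optimizer)
# ```
#
# Such a strategy would then be passed as `trainer.finetune(model, datamodule=datamodule, strategy=MyFinetuningStrategy())`.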
+ +# %% +trainer = flash.Trainer(max_epochs=1) + + +# %% [markdown] +# ## Finetune the model + +# %% +trainer.finetune(model, datamodule=datamodule, strategy="freeze") + + +# %% [markdown] +# ## Test the model + +# %% +trainer.test(model, datamodule=datamodule) + + +# %% [markdown] +# ## Save it! + +# %% +trainer.save_checkpoint("image_classification_model.pt") + +# %% [markdown] +# ## Predicting +# **Load the model from a checkpoint** + +# %% +model = ImageClassifier.load_from_checkpoint( + "https://flash-weights.s3.amazonaws.com/0.7.0/image_classification_model.pt" +) + +# %% [markdown] +# **Predict what's on a few images! ants or bees?** + +# %% +datamodule = ImageClassificationData.from_files( + predict_files=[ + "data/hymenoptera_data/val/bees/65038344_52a45d090d.jpg", + "data/hymenoptera_data/val/bees/590318879_68cf112861.jpg", + "data/hymenoptera_data/val/ants/540543309_ddbb193ee5.jpg", + ], + batch_size=1, +) +predictions = trainer.predict(model, datamodule=datamodule) +print(predictions) diff --git a/flash_tutorials/tabular_classification/.meta.yml b/flash_tutorials/tabular_classification/.meta.yml new file mode 100644 index 0000000..8e885a9 --- /dev/null +++ b/flash_tutorials/tabular_classification/.meta.yml @@ -0,0 +1,18 @@ +title: Tabular Classification on Titanic Dataset +author: Ethan Harris (ethan@pytorchlightning.ai) +created: 2021-11-23 +updated: 2022-08-26 +license: CC BY-SA +build: 3 +tags: + - Tabular Classification + - Tabular +description: | + In this notebook, we'll go over the basics of lightning Flash by training a TabularClassifier on [Titanic Dataset](https://www.kaggle.com/c/titanic). +requirements: + - lightning-flash[tabular]>=0.6.0 + - pytorch-lightning==1.3.6 # todo: update to latest + - numpy<1.24 +accelerator: + - GPU + - CPU diff --git a/flash_tutorials/tabular_classification/tabular_classification.py b/flash_tutorials/tabular_classification/tabular_classification.py new file mode 100644 index 0000000..a2089ca --- /dev/null +++ b/flash_tutorials/tabular_classification/tabular_classification.py @@ -0,0 +1,98 @@ +# %% [markdown] +# In this notebook, we'll go over the basics of lightning Flash by training a TabularClassifier on [Titanic Dataset](https://www.kaggle.com/c/titanic). + +# # Training + +# %% + +import flash +from flash.core.data.utils import download_data +from flash.tabular import TabularClassificationData, TabularClassifier + +# %% [markdown] +# ## Download the data +# The data are downloaded from a URL, and save in a 'data' directory. + +# %% +download_data("https://pl-flash-data.s3.amazonaws.com/titanic.zip", "data/") + + +# %% [markdown] +# ## Load the data +# Flash Tasks have built-in DataModules that you can use to organize your data. Pass in a train, validation and test folders and Flash will take care of the rest. +# +# Creates a TabularData relies on [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). + +# %% +datamodule = TabularClassificationData.from_csv( + ["Sex", "Age", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"], + ["Fare"], + target_fields="Survived", + train_file="./data/titanic/titanic.csv", + test_file="./data/titanic/test.csv", + val_split=0.25, + batch_size=8, +) + + +# %% [markdown] +# ## Build the model +# +# Note: Categorical columns will be mapped to the embedding space. Embedding space is set of tensors to be trained associated to each categorical column. + +# %% +model = TabularClassifier.from_data(datamodule) + + +# %% [markdown] +# ## Create the trainer. 
Run 10 times on data + +# %% +trainer = flash.Trainer(max_epochs=10) + + +# %% [markdown] +# ## Train the model + +# %% +trainer.fit(model, datamodule=datamodule) + +# %% [markdown] +# ## Test model + +# %% +trainer.test(model, datamodule=datamodule) + + +# %% [markdown] +# ## Save it! + +# %% +trainer.save_checkpoint("tabular_classification_model.pt") + + +# %% [markdown] +# # Predicting +# ## Load the model from a checkpoint +# +# `TabularClassifier.load_from_checkpoint` supports both url or local_path to a checkpoint. If provided with an url, the checkpoint will first be downloaded and laoded to re-create the model. + +# %% +model = TabularClassifier.load_from_checkpoint( + "https://flash-weights.s3.amazonaws.com/0.7.0/tabular_classification_model.pt" +) + + +# %% [markdown] +# ## Generate predictions from a sheet file! Who would survive? +# +# `TabularClassifier.predict` support both DataFrame and path to `.csv` file. + +# %% +datamodule = TabularClassificationData.from_csv( + predict_file="data/titanic/titanic.csv", + parameters=datamodule.parameters, + batch_size=8, +) +predictions = trainer.predict(model, datamodule=datamodule) +print(predictions) diff --git a/flash_tutorials/text_classification/.meta.yml b/flash_tutorials/text_classification/.meta.yml new file mode 100644 index 0000000..1b1592a --- /dev/null +++ b/flash_tutorials/text_classification/.meta.yml @@ -0,0 +1,19 @@ +title: Finetuning a Text Classifier on IMDB Dataset +author: Ethan Harris (ethan@pytorchlightning.ai) +created: 2021-11-23 +updated: 2022-08-26 +license: CC BY-SA +build: 3 +tags: + - Text Classification + - Text +description: | + In this notebook, we'll go over the basics of lightning Flash by finetunig a TextClassifier on IMDB Dataset. +requirements: + - pytorch-lightning==1.6.* + - lightning-flash[text]>=0.7.0 + - torchmetrics<0.11 # todo: update to use task=... + - numpy<1.24 +accelerator: + - GPU + - CPU diff --git a/flash_tutorials/text_classification/text_classification.py b/flash_tutorials/text_classification/text_classification.py new file mode 100644 index 0000000..f83bac5 --- /dev/null +++ b/flash_tutorials/text_classification/text_classification.py @@ -0,0 +1,110 @@ +# %% [markdown] +# In this notebook, we'll go over the basics of lightning Flash by finetunig a TextClassifier on [IMDB Dataset](https://paperswithcode.com/dataset/imdb-movie-reviews). +# +# # Finetuning +# +# Finetuning consists of four steps: +# +# - 1. Train a source neural network model on a source dataset. For text classication, it is traditionally a transformer model such as BERT [Bidirectional Encoder Representations from Transformers](https://arxiv.org/abs/1810.04805) trained on wikipedia. +# As those model are costly to train, [Transformers](https://github.com/huggingface/transformers) or [FairSeq](https://github.com/pytorch/fairseq) libraries provides popular pre-trained model architectures for NLP. In this notebook, we will be using [tiny-bert](https://huggingface.co/prajjwal1/bert-tiny). +# +# - 2. Create a new neural network the target model. Its architecture replicates all model designs and their parameters on the source model, expect the latest layer which is removed. This model without its latest layers is traditionally called a backbone +# +# - 3. Add new layers after the backbone where the latest output size is the number of target dataset categories. Those new layers, traditionally called head, will be randomly initialized while backbone will conserve its pre-trained weights from ImageNet. +# +# - 4. 
Train the target model on a target dataset, such as the IMDB dataset used in this notebook. However, freezing some layers (such as the backbone) at the start of training tends to be more stable. In Flash, this can easily be done with `trainer.finetune(..., strategy="freeze")`. It is also common to `freeze/unfreeze` the backbone, which in `Flash` can be done with `trainer.finetune(..., strategy="freeze_unfreeze")`. If one wants more control over the unfreezing flow, Flash supports `trainer.finetune(..., strategy=MyFinetuningStrategy())`, where `MyFinetuningStrategy` subclasses `pytorch_lightning.callbacks.BaseFinetuning`.

# %%

import flash
from flash.core.data.utils import download_data
from flash.text import TextClassificationData, TextClassifier

# %% [markdown]
# ## Download the data
# The data are downloaded from a URL and saved in a 'data' directory.

# %%
download_data("https://pl-flash-data.s3.amazonaws.com/imdb.zip", "data/")


# %% [markdown]
# ## Load the data
#
# Flash Tasks have built-in DataModules that you can use to organize your data. Pass in train, validation and test files and Flash will take care of the rest.
# This creates a TextClassificationData object from CSV files.

# %%
datamodule = TextClassificationData.from_csv(
    "review",
    "sentiment",
    train_file="data/imdb/train.csv",
    val_file="data/imdb/valid.csv",
    test_file="data/imdb/test.csv",
    batch_size=512,  # just increased for the example to run fast
)


# %% [markdown]
# ## Build the model
#
# Create the TextClassifier task. By default, the TextClassifier task uses a [tiny-bert](https://huggingface.co/prajjwal1/bert-tiny) backbone to train or finetune your model. You can use any model from [transformers - Text Classification](https://huggingface.co/models?filter=text-classification,pytorch).
#
# The backbone can easily be changed, for example with `TextClassifier(backbone='bert-tiny-mnli')`.

# %%
model = TextClassifier(num_classes=datamodule.num_classes, backbone="prajjwal1/bert-tiny")


# %% [markdown]
# ## Create the trainer. Run once on data

# %%
trainer = flash.Trainer(max_epochs=1)


# %% [markdown]
# ## Fine-tune the model
#
# We finetune on the IMDB dataset with the `"freeze"` strategy, which keeps the pre-trained backbone frozen and trains only the newly added classification head.

# %%
trainer.finetune(model, datamodule=datamodule, strategy="freeze")


# %% [markdown]
# ## Test model

# %%
trainer.test(model, datamodule=datamodule)


# %% [markdown]
# ## Save it!

# %%
trainer.save_checkpoint("text_classification_model.pt")


# %% [markdown]
# ## Predicting
# **Load the model from a checkpoint**

# %%
model = TextClassifier.load_from_checkpoint("text_classification_model.pt")


# %% [markdown]
# **Classify a few sentences! 
How was the movie?** + +# %% +datamodule = TextClassificationData.from_lists( + predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ], + batch_size=4, +) +predictions = trainer.predict(model, datamodule=datamodule) +print(predictions) diff --git a/lightning_examples/augmentation_kornia/.meta.yml b/lightning_examples/augmentation_kornia/.meta.yml new file mode 100644 index 0000000..7f5c11c --- /dev/null +++ b/lightning_examples/augmentation_kornia/.meta.yml @@ -0,0 +1,24 @@ +title: GPU and batched data augmentation with Kornia and PyTorch-Lightning +author: PL/Kornia team +created: 2021-06-11 +updated: 2023-03-15 +license: CC BY-SA +build: 0 +tags: + - Image +description: | + In this tutorial we will show how to combine both Kornia and PyTorch Lightning + to perform efficient data augmentation to train a simple model using the GPU in batch + mode without additional effort. +requirements: + - kornia + - lightning + - torchmetrics + - torchvision + - matplotlib + - pandas + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/lightning_examples/augmentation_kornia/.thumb.svg b/lightning_examples/augmentation_kornia/.thumb.svg new file mode 100644 index 0000000..481762a --- /dev/null +++ b/lightning_examples/augmentation_kornia/.thumb.svg @@ -0,0 +1,3 @@ + + + diff --git a/lightning_examples/augmentation_kornia/augmentation.py b/lightning_examples/augmentation_kornia/augmentation.py new file mode 100644 index 0000000..46ab969 --- /dev/null +++ b/lightning_examples/augmentation_kornia/augmentation.py @@ -0,0 +1,206 @@ +# %% +import os + +import lightning as L +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sn +import torch +import torch.nn as nn +import torchmetrics +import torchvision +from IPython.display import display +from kornia import image_to_tensor, tensor_to_image +from kornia.augmentation import ColorJitter, RandomChannelShuffle, RandomHorizontalFlip, RandomThinPlateSpline +from lightning.pytorch.loggers import CSVLogger +from torch import Tensor +from torch.nn import functional as F +from torch.utils.data import DataLoader +from torchvision.datasets import CIFAR10 + +sn.set() + +# %% [markdown] +# ## Define Data Augmentations module +# +# [Kornia](https://github.com/kornia/kornia) is low level Computer Vision library that provides a dedicated module +# [`kornia.augmentation`](https://kornia.readthedocs.io/en/latest/augmentation.html) module implementing +# en extensive set of data augmentation techniques for image and video. +# +# Similar to Lightning, in Kornia it's promoted to encapsulate functionalities inside classes for readability +# and efficiency purposes. In this case, we define a data augmentaton pipeline subclassing a `nn.Module` +# where the augmentation_kornia (also subclassing `nn.Module`) are combined with other PyTorch components +# such as `nn.Sequential`. +# +# Checkout the different augmentation operators in Kornia docs and experiment yourself! 
+
+
+# %%
+class DataAugmentation(nn.Module):
+    """Module to perform data augmentation using Kornia on torch tensors."""
+
+    def __init__(self, apply_color_jitter: bool = False) -> None:
+        super().__init__()
+        self._apply_color_jitter = apply_color_jitter
+
+        self.transforms = nn.Sequential(
+            RandomHorizontalFlip(p=0.75),
+            RandomChannelShuffle(p=0.75),
+            RandomThinPlateSpline(p=0.75),
+        )
+
+        self.jitter = ColorJitter(0.5, 0.5, 0.5, 0.5)
+
+    @torch.no_grad()  # disable gradients for efficiency
+    def forward(self, x: Tensor) -> Tensor:
+        x_out = self.transforms(x)  # BxCxHxW
+        if self._apply_color_jitter:
+            x_out = self.jitter(x_out)
+        return x_out
+
+
+# %% [markdown]
+# ## Define a Pre-processing module
+#
+# In addition to the `DataAugmentation` module that will sample random parameters during the training stage,
+# we define a `Preprocess` class to handle converting the input images into `Tensor`s.
+#
+# For this example we use the `torchvision` CIFAR10 dataset, which returns `PIL.Image` samples; however,
+# to take full advantage of PyTorch and Kornia we need to cast the images into tensors.
+#
+# To do that we use `kornia.image_to_tensor`, which casts and permutes the images into the right format.
+
+
+# %%
+class Preprocess(nn.Module):
+    """Module to perform pre-processing using Kornia on torch tensors."""
+
+    @torch.no_grad()  # disable gradients for efficiency
+    def forward(self, x) -> Tensor:
+        x_tmp: np.ndarray = np.array(x)  # HxWxC
+        x_out: Tensor = image_to_tensor(x_tmp, keepdim=True)  # CxHxW
+        return x_out.float() / 255.0
+
+
+# %% [markdown]
+# ## Define PyTorch Lightning model
+#
+# The next step is to define our `LightningModule` to properly organise our training pipeline.
+# This is a simple example, just to show how to structure your baseline to be used as a reference;
+# do not expect high performance.
+#
+# Notice that the `Preprocess` class is injected into the dataset and will be applied per sample.
+#
+# The interesting part of the proposed approach happens inside the `on_after_batch_transfer` hook, where with just a single
+# line of code we apply the data augmentation to the whole batch without having to worry about the device.
+# This means that our `DataAugmentation` pipeline will automatically be executed on the GPU.
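+#
+# As a small illustrative check (using a dummy `PIL.Image`; PIL ships as a torchvision dependency),
+# the `Preprocess` module maps a single image to a normalized CxHxW float tensor:
+
+# %%
+from PIL import Image
+
+_preprocess = Preprocess()
+_dummy_pil = Image.new("RGB", (32, 32))  # dummy image, illustration only
+_sample_tensor = _preprocess(_dummy_pil)
+print(_sample_tensor.shape, _sample_tensor.dtype)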
+ + +# %% +class CoolSystem(L.LightningModule): + def __init__(self): + super().__init__() + # not the best model: expereiment yourself + self.model = torchvision.models.resnet18(pretrained=True) + self.preprocess = Preprocess() # per sample transforms + self.transform = DataAugmentation() # per batch augmentation_kornia + self.train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=1000) + self.val_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=1000) + + def forward(self, x): + return self.model(x) + + def compute_loss(self, y_hat, y): + return F.cross_entropy(y_hat, y) + + def show_batch(self, win_size=(10, 10)): + def _to_vis(data): + return tensor_to_image(torchvision.utils.make_grid(data, nrow=8)) + + # get a batch from the training set: try with `val_datlaoader` :) + imgs, labels = next(iter(self.train_dataloader())) + imgs_aug = self.transform(imgs) # apply transforms + # use matplotlib to visualize + plt.figure(figsize=win_size) + plt.imshow(_to_vis(imgs)) + plt.figure(figsize=win_size) + plt.imshow(_to_vis(imgs_aug)) + + def on_after_batch_transfer(self, batch, dataloader_idx): + x, y = batch + if self.trainer.training: + x = self.transform(x) # => we perform GPU/Batched data augmentation + return x, y + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = self.compute_loss(y_hat, y) + self.train_accuracy.update(y_hat, y) + self.log("train_loss", loss, prog_bar=False) + self.log("train_acc", self.train_accuracy, prog_bar=False) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self(x) + loss = self.compute_loss(y_hat, y) + self.val_accuracy.update(y_hat, y) + self.log("valid_loss", loss, prog_bar=False) + self.log("valid_acc", self.val_accuracy, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.AdamW(self.model.parameters(), lr=1e-4) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, self.trainer.max_epochs, 0) + return [optimizer], [scheduler] + + def prepare_data(self): + CIFAR10(os.getcwd(), train=True, download=True, transform=self.preprocess) + CIFAR10(os.getcwd(), train=False, download=True, transform=self.preprocess) + + def train_dataloader(self): + dataset = CIFAR10(os.getcwd(), train=True, download=True, transform=self.preprocess) + loader = DataLoader(dataset, batch_size=32) + return loader + + def val_dataloader(self): + dataset = CIFAR10(os.getcwd(), train=False, download=True, transform=self.preprocess) + loader = DataLoader(dataset, batch_size=32) + return loader + + +# %% [markdown] +# ## Visualize images + +# %% +# init model +model = CoolSystem() + +# %% +model.show_batch(win_size=(14, 14)) + +# %% [markdown] +# ## Run training + +# %% +# Initialize a trainer +trainer = L.Trainer( + accelerator="auto", + devices=1, + max_epochs=10, + logger=CSVLogger(save_dir="logs/"), +) + +# Train the model ⚡ +trainer.fit(model) + +# %% [markdown] +# ### Visualize the training results + +# %% +metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv") +del metrics["step"] +metrics.set_index("epoch", inplace=True) +display(metrics.dropna(axis=1, how="all").head()) +sn.relplot(data=metrics, kind="line") diff --git a/lightning_examples/barlow-twins/.meta.yml b/lightning_examples/barlow-twins/.meta.yml new file mode 100644 index 0000000..9a2227b --- /dev/null +++ b/lightning_examples/barlow-twins/.meta.yml @@ -0,0 +1,23 @@ +title: Barlow Twins Tutorial +author: Ananya Harsh Jha +created: 2021-09-19 +updated: 2023-03-15 +license: CC 
BY-SA
+build: 0
+tags:
+  - Image
+  - Self-Supervised
+description: |
+  This notebook describes the self-supervised learning method Barlow Twins.
+  Barlow Twins differs from other recently proposed algorithms as it doesn't
+  fall under the category of either contrastive learning, or methods like knowledge
+  distillation or clustering. The simplicity of the loss function and its effectiveness
+  in comparison to the current state of the art makes Barlow Twins an interesting
+  case study.
+requirements:
+  - torchvision
+  - matplotlib
+  - lightning>=2.0.0rc0
+accelerator:
+  - GPU
+  - CPU
diff --git a/lightning_examples/barlow-twins/barlow_twins.py b/lightning_examples/barlow-twins/barlow_twins.py
new file mode 100644
index 0000000..6f34a67
--- /dev/null
+++ b/lightning_examples/barlow-twins/barlow_twins.py
@@ -0,0 +1,444 @@
+# %% [markdown]
+# ## Barlow Twins
+#
+# Barlow Twins finds itself in a unique place amongst the current state-of-the-art self-supervised learning methods. It does not fall under the existing categories of contrastive learning, knowledge distillation or clustering based methods. Instead, it creates its own category of redundancy reduction and achieves competitive performance with a simple yet effective loss function. In this tutorial, we look at coding up a small version of the Barlow Twins algorithm using PyTorch Lightning.

+# %%
+from functools import partial
+from typing import Sequence, Tuple, Union
+
+import lightning as L
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms as transforms
+import torchvision.transforms.functional as VisionF
+from lightning.pytorch.callbacks import Callback, ModelCheckpoint
+from torch import Tensor
+from torch.utils.data import DataLoader
+from torchmetrics.functional import accuracy
+from torchvision.datasets import CIFAR10
+from torchvision.models.resnet import resnet18
+from torchvision.utils import make_grid
+
+batch_size = 32
+num_workers = 0  # to run notebook on CPU
+max_epochs = 200
+z_dim = 128
+
+
+# %% [markdown]
+# ### Transforms
+#
+# We first define the data augmentation pipeline used in Barlow Twins. Here, we use the pipeline proposed in SimCLR, which generates two copies/views of an input image by applying the following transformations in sequence.
+#
+# First, it takes a random crop of the image and resizes it to a fixed pre-specified size. Then, it applies a left-to-right random flip with a probability of 0.5. This step is followed by a composition of color jitter, conversion to grayscale with a probability of 0.2 and the application of a Gaussian blur filter. Finally, we normalize the image and convert it to a tensor.
+#
+# Within this transform, we add a third view for our online finetuner, which we explain later on. But, to explain things quickly here, we add another transform that lets us test our encoder on a downstream classification task.
+
+
+# %%
+class BarlowTwinsTransform:
+    def __init__(self, train=True, input_height=224, gaussian_blur=True, jitter_strength=1.0, normalize=None):
+        self.input_height = input_height
+        self.gaussian_blur = gaussian_blur
+        self.jitter_strength = jitter_strength
+        self.normalize = normalize
+        self.train = train
+
+        color_jitter = transforms.ColorJitter(
+            0.8 * self.jitter_strength,
+            0.8 * self.jitter_strength,
+            0.8 * self.jitter_strength,
+            0.2 * self.jitter_strength,
+        )
+
+        color_transform = [transforms.RandomApply([color_jitter], p=0.8), transforms.RandomGrayscale(p=0.2)]
+
+        if self.gaussian_blur:
+            kernel_size = int(0.1 * self.input_height)
+            if kernel_size % 2 == 0:
+                kernel_size += 1
+
+            color_transform.append(transforms.RandomApply([transforms.GaussianBlur(kernel_size=kernel_size)], p=0.5))
+
+        self.color_transform = transforms.Compose(color_transform)
+
+        if normalize is None:
+            self.final_transform = transforms.ToTensor()
+        else:
+            self.final_transform = transforms.Compose([transforms.ToTensor(), normalize])
+
+        self.transform = transforms.Compose(
+            [
+                transforms.RandomResizedCrop(self.input_height),
+                transforms.RandomHorizontalFlip(p=0.5),
+                self.color_transform,
+                self.final_transform,
+            ]
+        )
+
+        self.finetune_transform = None
+        if self.train:
+            self.finetune_transform = transforms.Compose(
+                [
+                    transforms.RandomCrop(32, padding=4, padding_mode="reflect"),
+                    transforms.RandomHorizontalFlip(),
+                    transforms.ToTensor(),
+                ]
+            )
+        else:
+            self.finetune_transform = transforms.ToTensor()
+
+    def __call__(self, sample):
+        return self.transform(sample), self.transform(sample), self.finetune_transform(sample)
+
+
+# %% [markdown]
+# ### Dataset
+#
+# We select CIFAR10 as the dataset to demonstrate the pre-training process for Barlow Twins. CIFAR10 images are 32x32 in size and we do not apply a Gaussian blur transformation on them. In this step, we create the training and validation dataloaders for CIFAR10.
+
+
+# %%
+def cifar10_normalization():
+    normalize = transforms.Normalize(
+        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]], std=[x / 255.0 for x in [63.0, 62.1, 66.7]]
+    )
+    return normalize
+
+
+train_transform = BarlowTwinsTransform(
+    train=True, input_height=32, gaussian_blur=False, jitter_strength=0.5, normalize=cifar10_normalization()
+)
+train_dataset = CIFAR10(root=".", train=True, download=True, transform=train_transform)
+
+val_transform = BarlowTwinsTransform(
+    train=False, input_height=32, gaussian_blur=False, jitter_strength=0.5, normalize=cifar10_normalization()
+)
+val_dataset = CIFAR10(root=".", train=False, download=True, transform=val_transform)
+
+train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
+val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=True)
+
+# %% [markdown]
+# ### Plot images
+#
+# To see how the CIFAR10 images look after the data augmentation pipeline, we load a few images from the dataloader and plot them here.
+
+# %%
+for batch in val_loader:
+    (img1, img2, _), label = batch
+    break
+
+img_grid = make_grid(img1, normalize=True)
+
+
+def show(imgs):
+    if not isinstance(imgs, list):
+        imgs = [imgs]
+    fig, axs = plt.subplots(ncols=len(imgs), squeeze=False)
+    for i, img in enumerate(imgs):
+        img = img.detach()
+        img = VisionF.to_pil_image(img)
+        axs[0, i].imshow(np.asarray(img))
+        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
+
+
+show(img_grid)
+
+
+# %% [markdown]
+# ### Barlow Twins Loss
+#
+# Here we define the loss function for Barlow Twins. It first normalizes the D-dimensional vectors from the projection head and then computes the DxD cross-correlation matrix between the normalized vectors of the 2 views of each image.
+#
+# Then it splits this cross-correlation matrix into two parts. The first part, the diagonal of this matrix, is brought closer to 1, which pushes up the cosine similarity between the latent vectors of two views of each image, thus making the backbone invariant to the transformations applied to the views. The second part of the loss pushes the non-diagonal elements of the cross-correlation matrix closer to 0. This reduces the redundancy between the different dimensions of the latent vector.
+
+
+# %%
+class BarlowTwinsLoss(nn.Module):
+    def __init__(self, batch_size, lambda_coeff=5e-3, z_dim=128):
+        super().__init__()
+
+        self.z_dim = z_dim
+        self.batch_size = batch_size
+        self.lambda_coeff = lambda_coeff
+
+    def off_diagonal_ele(self, x):
+        # taken from: https://github.com/facebookresearch/barlowtwins/blob/main/main.py
+        # return a flattened view of the off-diagonal elements of a square matrix
+        n, m = x.shape
+        assert n == m
+        return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()
+
+    def forward(self, z1, z2):
+        # N x D, where N is the batch size and D is output dim of projection head
+        z1_norm = (z1 - torch.mean(z1, dim=0)) / torch.std(z1, dim=0)
+        z2_norm = (z2 - torch.mean(z2, dim=0)) / torch.std(z2, dim=0)
+
+        cross_corr = torch.matmul(z1_norm.T, z2_norm) / self.batch_size
+
+        on_diag = torch.diagonal(cross_corr).add_(-1).pow_(2).sum()
+        off_diag = self.off_diagonal_ele(cross_corr).pow_(2).sum()
+
+        return on_diag + self.lambda_coeff * off_diag
+
+
+# %% [markdown]
+# ### Backbone
+#
+# This is a standard Resnet backbone that we pre-train using the Barlow Twins method. To accommodate the 32x32 CIFAR10 images, we replace the first 7x7 convolution of the Resnet backbone with a 3x3 filter. We also remove the first Maxpool layer from the network for CIFAR10 images.

+# %%
+encoder = resnet18()
+
+# for CIFAR10, replace the first 7x7 conv with smaller 3x3 conv and remove the first maxpool
+encoder.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+encoder.maxpool = nn.MaxPool2d(kernel_size=1, stride=1)
+
+# replace classification fc layer of Resnet to obtain representations from the backbone
+encoder.fc = nn.Identity()
+
+
+# %% [markdown]
+# ### Projection head
+#
+# Unlike SimCLR and BYOL, the downstream performance of Barlow Twins greatly benefits from having a larger projection head after the backbone network. The paper utilizes a 3-layer MLP with 8192 hidden dimensions and 8192 as the output dimension of the projection head. For the purposes of the tutorial, we use a smaller projection head. However, it is worth noting that in practice, Barlow Twins needs to be trained using a bigger projection head, as it is highly sensitive to its architecture and output dimensionality.
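+#
+# As a quick, illustrative sanity check of the dimensions involved, the modified backbone above
+# produces 512-dimensional features for CIFAR10-sized inputs, which is the input dimension the
+# projection head defined below will expect:
+
+# %%
+with torch.no_grad():
+    print(encoder(torch.rand(2, 3, 32, 32)).shape)  # expected: torch.Size([2, 512])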
+ + +# %% +class ProjectionHead(nn.Module): + def __init__(self, input_dim=2048, hidden_dim=2048, output_dim=128): + super().__init__() + + self.projection_head = nn.Sequential( + nn.Linear(input_dim, hidden_dim, bias=True), + nn.BatchNorm1d(hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, output_dim, bias=False), + ) + + def forward(self, x): + return self.projection_head(x) + + +# %% [markdown] +# ### Learning rate warmup +# +# For the purposes of this tutorial, we keep things simple and use a linear warmup schedule with Adam optimizer. In our previous experiments we have found that linear warmup part is much more important for the final performance of a model than the cosine decay component of the schedule. + + +# %% +def fn(warmup_steps, step): + if step < warmup_steps: + return float(step) / float(max(1, warmup_steps)) + else: + return 1.0 + + +def linear_warmup_decay(warmup_steps): + return partial(fn, warmup_steps) + + +# %% [markdown] +# ### Barlow Twins Lightning Module +# +# We keep the LightningModule for Barlow Twins neat and simple. It takes in an backbone encoder and initializes the projection head and the loss function. We configure the optimizer and the learning rate scheduler in the ``configure_optimizers`` method. + + +# %% +class BarlowTwins(L.LightningModule): + def __init__( + self, + encoder, + encoder_out_dim, + num_training_samples, + batch_size, + lambda_coeff=5e-3, + z_dim=128, + learning_rate=1e-4, + warmup_epochs=10, + max_epochs=200, + ): + super().__init__() + + self.encoder = encoder + self.projection_head = ProjectionHead(input_dim=encoder_out_dim, hidden_dim=encoder_out_dim, output_dim=z_dim) + self.loss_fn = BarlowTwinsLoss(batch_size=batch_size, lambda_coeff=lambda_coeff, z_dim=z_dim) + + self.learning_rate = learning_rate + self.warmup_epochs = warmup_epochs + self.max_epochs = max_epochs + + self.train_iters_per_epoch = num_training_samples // batch_size + + def forward(self, x): + return self.encoder(x) + + def shared_step(self, batch): + (x1, x2, _), _ = batch + + z1 = self.projection_head(self.encoder(x1)) + z2 = self.projection_head(self.encoder(x2)) + + return self.loss_fn(z1, z2) + + def training_step(self, batch, batch_idx): + loss = self.shared_step(batch) + self.log("train_loss", loss, on_step=True, on_epoch=False) + return loss + + def validation_step(self, batch, batch_idx): + loss = self.shared_step(batch) + self.log("val_loss", loss, on_step=False, on_epoch=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + + warmup_steps = self.train_iters_per_epoch * self.warmup_epochs + + scheduler = { + "scheduler": torch.optim.lr_scheduler.LambdaLR( + optimizer, + linear_warmup_decay(warmup_steps), + ), + "interval": "step", + "frequency": 1, + } + + return [optimizer], [scheduler] + + +# %% [markdown] +# ### Evaluation +# +# We define a callback which appends a linear layer on top of the encoder and trains the classification evaluation head in an online manner. We make sure not to backpropagate the gradients back to the encoder while tuning the linear layer. This technique was used in SimCLR as well and they showed that the final downstream classification peformance is pretty much similar to the results on online finetuning as the training progresses. 
+ + +# %% +class OnlineFineTuner(Callback): + def __init__( + self, + encoder_output_dim: int, + num_classes: int, + ) -> None: + super().__init__() + + self.optimizer: torch.optim.Optimizer + + self.encoder_output_dim = encoder_output_dim + self.num_classes = num_classes + + def on_fit_start(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None: + # add linear_eval layer and optimizer + pl_module.online_finetuner = nn.Linear(self.encoder_output_dim, self.num_classes).to(pl_module.device) + self.optimizer = torch.optim.Adam(pl_module.online_finetuner.parameters(), lr=1e-4) + + def extract_online_finetuning_view( + self, batch: Sequence, device: Union[str, torch.device] + ) -> Tuple[Tensor, Tensor]: + (_, _, finetune_view), y = batch + finetune_view = finetune_view.to(device) + y = y.to(device) + + return finetune_view, y + + def on_train_batch_end( + self, + trainer: L.Trainer, + pl_module: L.LightningModule, + outputs: Sequence, + batch: Sequence, + batch_idx: int, + ) -> None: + x, y = self.extract_online_finetuning_view(batch, pl_module.device) + + with torch.no_grad(): + feats = pl_module(x) + + feats = feats.detach() + preds = pl_module.online_finetuner(feats) + loss = F.cross_entropy(preds, y) + + loss.backward() + self.optimizer.step() + self.optimizer.zero_grad() + + acc = accuracy(F.softmax(preds, dim=1), y, task="multiclass", num_classes=10) + pl_module.log("online_train_acc", acc, on_step=True, on_epoch=False) + pl_module.log("online_train_loss", loss, on_step=True, on_epoch=False) + + def on_validation_batch_end( + self, + trainer: L.Trainer, + pl_module: L.LightningModule, + outputs: Sequence, + batch: Sequence, + batch_idx: int, + ) -> None: + x, y = self.extract_online_finetuning_view(batch, pl_module.device) + + with torch.no_grad(): + feats = pl_module(x) + + feats = feats.detach() + preds = pl_module.online_finetuner(feats) + loss = F.cross_entropy(preds, y) + + acc = accuracy(F.softmax(preds, dim=1), y, task="multiclass", num_classes=10) + pl_module.log("online_val_acc", acc, on_step=False, on_epoch=True, sync_dist=True) + pl_module.log("online_val_loss", loss, on_step=False, on_epoch=True, sync_dist=True) + + +# %% [markdown] +# Finally, we define the trainer for training the model. We pass in the ``train_loader`` and ``val_loader`` we had initialized earlier to the ``fit`` function. + +# %% +encoder_out_dim = 512 + +model = BarlowTwins( + encoder=encoder, + encoder_out_dim=encoder_out_dim, + num_training_samples=len(train_dataset), + batch_size=batch_size, + z_dim=z_dim, +) + +online_finetuner = OnlineFineTuner(encoder_output_dim=encoder_out_dim, num_classes=10) +checkpoint_callback = ModelCheckpoint(every_n_epochs=100, save_top_k=-1, save_last=True) + +trainer = L.Trainer( + max_epochs=max_epochs, + accelerator="auto", + devices=1, + callbacks=[online_finetuner, checkpoint_callback], +) + +# uncomment this to train the model +# this is done for the tutorial so that the notebook compiles +# trainer.fit(model, train_loader, val_loader) + +# %% [markdown] +# ### Using the trained encoder for downstream tasks +# +# Once the encoder is pretrained on CIFAR10, we can use it to get image embeddings and use them further downstream on tasks like classification, detection, segmentation etc. +# +# In this tutorial, we did not completely train our encoder for 100s of epochs using the Barlow Twins pretraining method. So, we will load the pretrained encoder weights from a checkpoint and show the image embeddings obtained from that. 
+# +# To create this checkpoint, the encoder was pretrained for 200 epochs, and obtained a online finetune accuracy of x% on CIFAR-10. + +# %% +# ckpt_model = torch.load('') # upload checkpoint to aws +# encoder = ckpt_model.encoder +encoder = model.encoder + +downstream_dataset = CIFAR10(root=".", train=False, transform=transforms.ToTensor()) +dataloader = DataLoader(downstream_dataset, batch_size=4, shuffle=False) + +for batch in dataloader: + img, label = batch + print(encoder(img).shape) + break diff --git a/lightning_examples/basic-gan/.meta.yaml b/lightning_examples/basic-gan/.meta.yaml new file mode 100644 index 0000000..dca0517 --- /dev/null +++ b/lightning_examples/basic-gan/.meta.yaml @@ -0,0 +1,20 @@ +title: PyTorch Lightning Basic GAN Tutorial +author: PL team +created: 2020-12-21 +updated: 2023-03-15 +license: CC BY-SA +build: 0 +tags: + - Image +description: | + How to train a GAN! + + Main takeaways: + 1. Generator and discriminator are arbitrary PyTorch modules. + 2. training_step does both the generator and discriminator training. +requirements: + - torchvision + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/lightning_examples/basic-gan/gan.py b/lightning_examples/basic-gan/gan.py new file mode 100644 index 0000000..b8e3827 --- /dev/null +++ b/lightning_examples/basic-gan/gan.py @@ -0,0 +1,266 @@ +# %% +import os + +import lightning as L +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision +import torchvision.transforms as transforms +from torch.utils.data import DataLoader, random_split +from torchvision.datasets import MNIST + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 +NUM_WORKERS = int(os.cpu_count() / 2) + +# %% [markdown] +# ### MNIST DataModule +# +# Below, we define a DataModule for the MNIST Dataset. To learn more about DataModules, check out our tutorial +# on them or see the [latest release docs](https://lightning.ai/docs/pytorch/stable/data/datamodule.html). + + +# %% +class MNISTDataModule(L.LightningDataModule): + def __init__( + self, + data_dir: str = PATH_DATASETS, + batch_size: int = BATCH_SIZE, + num_workers: int = NUM_WORKERS, + ): + super().__init__() + self.data_dir = data_dir + self.batch_size = batch_size + self.num_workers = num_workers + + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + self.dims = (1, 28, 28) + self.num_classes = 10 + + def prepare_data(self): + # download + MNIST(self.data_dir, train=True, download=True) + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform) + + def train_dataloader(self): + return DataLoader( + self.mnist_train, + batch_size=self.batch_size, + num_workers=self.num_workers, + ) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=self.num_workers) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers) + + +# %% [markdown] +# ### A. 
Generator + + +# %% +class Generator(nn.Module): + def __init__(self, latent_dim, img_shape): + super().__init__() + self.img_shape = img_shape + + def block(in_feat, out_feat, normalize=True): + layers = [nn.Linear(in_feat, out_feat)] + if normalize: + layers.append(nn.BatchNorm1d(out_feat, 0.8)) + layers.append(nn.LeakyReLU(0.2, inplace=True)) + return layers + + self.model = nn.Sequential( + *block(latent_dim, 128, normalize=False), + *block(128, 256), + *block(256, 512), + *block(512, 1024), + nn.Linear(1024, int(np.prod(img_shape))), + nn.Tanh(), + ) + + def forward(self, z): + img = self.model(z) + img = img.view(img.size(0), *self.img_shape) + return img + + +# %% [markdown] +# ### B. Discriminator + + +# %% +class Discriminator(nn.Module): + def __init__(self, img_shape): + super().__init__() + + self.model = nn.Sequential( + nn.Linear(int(np.prod(img_shape)), 512), + nn.LeakyReLU(0.2, inplace=True), + nn.Linear(512, 256), + nn.LeakyReLU(0.2, inplace=True), + nn.Linear(256, 1), + nn.Sigmoid(), + ) + + def forward(self, img): + img_flat = img.view(img.size(0), -1) + validity = self.model(img_flat) + + return validity + + +# %% [markdown] +# ### C. GAN +# +# #### A couple of cool features to check out in this example... +# +# - We use `some_tensor.type_as(another_tensor)` to make sure we initialize new tensors on the right device (i.e. GPU, CPU). +# - Lightning will put your dataloader data on the right device automatically +# - In this example, we pull from latent dim on the fly, so we need to dynamically add tensors to the right device. +# - `type_as` is the way we recommend to do this. +# - This example shows how to use multiple dataloaders in your `LightningModule`. + + +# %% +class GAN(L.LightningModule): + def __init__( + self, + channels, + width, + height, + latent_dim: int = 100, + lr: float = 0.0002, + b1: float = 0.5, + b2: float = 0.999, + batch_size: int = BATCH_SIZE, + **kwargs, + ): + super().__init__() + self.save_hyperparameters() + self.automatic_optimization = False + + # networks + data_shape = (channels, width, height) + self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=data_shape) + self.discriminator = Discriminator(img_shape=data_shape) + + self.validation_z = torch.randn(8, self.hparams.latent_dim) + + self.example_input_array = torch.zeros(2, self.hparams.latent_dim) + + def forward(self, z): + return self.generator(z) + + def adversarial_loss(self, y_hat, y): + return F.binary_cross_entropy(y_hat, y) + + def training_step(self, batch): + imgs, _ = batch + + optimizer_g, optimizer_d = self.optimizers() + + # sample noise + z = torch.randn(imgs.shape[0], self.hparams.latent_dim) + z = z.type_as(imgs) + + # train generator + # generate images + self.toggle_optimizer(optimizer_g) + self.generated_imgs = self(z) + + # log sampled images + sample_imgs = self.generated_imgs[:6] + grid = torchvision.utils.make_grid(sample_imgs) + self.logger.experiment.add_image("generated_images", grid, 0) + + # ground truth result (ie: all fake) + # put on GPU because we created this tensor inside training_loop + valid = torch.ones(imgs.size(0), 1) + valid = valid.type_as(imgs) + + # adversarial loss is binary cross-entropy + g_loss = self.adversarial_loss(self.discriminator(self(z)), valid) + self.log("g_loss", g_loss, prog_bar=True) + self.manual_backward(g_loss) + optimizer_g.step() + optimizer_g.zero_grad() + self.untoggle_optimizer(optimizer_g) + + # train discriminator + # Measure discriminator's ability to classify real from generated samples + 
self.toggle_optimizer(optimizer_d) + + # how well can it label as real? + valid = torch.ones(imgs.size(0), 1) + valid = valid.type_as(imgs) + + real_loss = self.adversarial_loss(self.discriminator(imgs), valid) + + # how well can it label as fake? + fake = torch.zeros(imgs.size(0), 1) + fake = fake.type_as(imgs) + + fake_loss = self.adversarial_loss(self.discriminator(self(z).detach()), fake) + + # discriminator loss is the average of these + d_loss = (real_loss + fake_loss) / 2 + self.log("d_loss", d_loss, prog_bar=True) + self.manual_backward(d_loss) + optimizer_d.step() + optimizer_d.zero_grad() + self.untoggle_optimizer(optimizer_d) + + def configure_optimizers(self): + lr = self.hparams.lr + b1 = self.hparams.b1 + b2 = self.hparams.b2 + + opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2)) + opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2)) + return [opt_g, opt_d], [] + + def on_validation_epoch_end(self): + z = self.validation_z.type_as(self.generator.model[0].weight) + + # log sampled images + sample_imgs = self(z) + grid = torchvision.utils.make_grid(sample_imgs) + self.logger.experiment.add_image("generated_images", grid, self.current_epoch) + + +# %% +dm = MNISTDataModule() +model = GAN(*dm.dims) +trainer = L.Trainer( + accelerator="auto", + devices=1, + max_epochs=5, +) +trainer.fit(model, dm) + +# %% +# Start tensorboard. +# %load_ext tensorboard +# %tensorboard --logdir lightning_logs/ diff --git a/lightning_examples/cifar10-baseline/.meta.yml b/lightning_examples/cifar10-baseline/.meta.yml new file mode 100644 index 0000000..6862531 --- /dev/null +++ b/lightning_examples/cifar10-baseline/.meta.yml @@ -0,0 +1,17 @@ +title: PyTorch Lightning CIFAR10 ~94% Baseline Tutorial +author: PL team +created: 2020-12-21 +updated: 2023-03-15 +license: CC BY-SA +build: 0 +tags: + - Image +description: > + Train a Resnet to 94% accuracy on Cifar10! 
+requirements: + - torchvision + - pandas + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - GPU diff --git a/lightning_examples/cifar10-baseline/baseline.py b/lightning_examples/cifar10-baseline/baseline.py new file mode 100644 index 0000000..9abc859 --- /dev/null +++ b/lightning_examples/cifar10-baseline/baseline.py @@ -0,0 +1,255 @@ +# %% +# Run this if you intend to use TPUs +# # !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl + +# %% +import os + +import lightning as L +import pandas as pd +import seaborn as sn +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision +from IPython.display import display +from lightning.pytorch.callbacks import LearningRateMonitor +from lightning.pytorch.loggers import CSVLogger +from torch.optim.lr_scheduler import OneCycleLR +from torch.optim.swa_utils import AveragedModel, update_bn +from torch.utils.data import DataLoader, random_split +from torchmetrics.functional import accuracy +from torchvision.datasets import CIFAR10 + +L.seed_everything(7) + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 +NUM_WORKERS = int(os.cpu_count() / 2) + +# %% [markdown] +# ### CIFAR10 DataLoaders +# + +# %% + +cifar10_normalization = torchvision.transforms.Normalize( + mean=[x / 255.0 for x in [125.3, 123.0, 113.9]], + std=[x / 255.0 for x in [63.0, 62.1, 66.7]], +) + + +def split_dataset(dataset, val_split=0.2, train=True): + """Splits the dataset into train and validation set.""" + len_dataset = len(dataset) + splits = get_splits(len_dataset, val_split) + dataset_train, dataset_val = random_split(dataset, splits, generator=torch.Generator().manual_seed(42)) + + if train: + return dataset_train + return dataset_val + + +def get_splits(len_dataset, val_split): + """Computes split lengths for train and validation set.""" + if isinstance(val_split, int): + train_len = len_dataset - val_split + splits = [train_len, val_split] + elif isinstance(val_split, float): + val_len = int(val_split * len_dataset) + train_len = len_dataset - val_len + splits = [train_len, val_len] + else: + raise ValueError(f"Unsupported type {type(val_split)}") + + return splits + + +train_transforms = torchvision.transforms.Compose( + [ + torchvision.transforms.RandomCrop(32, padding=4), + torchvision.transforms.RandomHorizontalFlip(), + torchvision.transforms.ToTensor(), + cifar10_normalization, + ] +) +test_transforms = torchvision.transforms.Compose( + [ + torchvision.transforms.ToTensor(), + cifar10_normalization, + ] +) + +dataset_train = CIFAR10(PATH_DATASETS, train=True, download=True, transform=train_transforms) +dataset_val = CIFAR10(PATH_DATASETS, train=True, download=True, transform=test_transforms) +dataset_train = split_dataset(dataset_train) +dataset_val = split_dataset(dataset_val, train=False) +dataset_test = CIFAR10(PATH_DATASETS, train=False, download=True, transform=test_transforms) + +train_dataloader = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS) +val_dataloader = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS) +test_dataloader = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS) + + +# %% [markdown] +# ### Resnet +# Modify the pre-existing Resnet architecture from TorchVision. The pre-existing architecture is based on ImageNet +# images (224x224) as input. 
So we need to modify it for CIFAR10 images (32x32). + + +# %% +def create_model(): + model = torchvision.models.resnet18(pretrained=False, num_classes=10) + model.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + model.maxpool = nn.Identity() + return model + + +# %% [markdown] +# ### Lightning Module +# Check out the [`configure_optimizers`](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#configure-optimizers) +# method to use custom Learning Rate schedulers. The OneCycleLR with SGD will get you to around 92-93% accuracy +# in 20-30 epochs and 93-94% accuracy in 40-50 epochs. Feel free to experiment with different +# LR schedules from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + + +# %% +class LitResnet(L.LightningModule): + def __init__(self, lr=0.05): + super().__init__() + + self.save_hyperparameters() + self.model = create_model() + + def forward(self, x): + out = self.model(x) + return F.log_softmax(out, dim=1) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + self.log("train_loss", loss) + return loss + + def evaluate(self, batch, stage=None): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + acc = accuracy(preds, y, task="multiclass", num_classes=10) + + if stage: + self.log(f"{stage}_loss", loss, prog_bar=True) + self.log(f"{stage}_acc", acc, prog_bar=True) + + def validation_step(self, batch, batch_idx): + self.evaluate(batch, "val") + + def test_step(self, batch, batch_idx): + self.evaluate(batch, "test") + + def configure_optimizers(self): + optimizer = torch.optim.SGD( + self.parameters(), + lr=self.hparams.lr, + momentum=0.9, + weight_decay=5e-4, + ) + steps_per_epoch = 45000 // BATCH_SIZE + scheduler_dict = { + "scheduler": OneCycleLR( + optimizer, + 0.1, + epochs=self.trainer.max_epochs, + steps_per_epoch=steps_per_epoch, + ), + "interval": "step", + } + return {"optimizer": optimizer, "lr_scheduler": scheduler_dict} + + +# %% +model = LitResnet(lr=0.05) + +trainer = L.Trainer( + max_epochs=30, + accelerator="auto", + devices=1, + logger=CSVLogger(save_dir="logs/"), + callbacks=[LearningRateMonitor(logging_interval="step")], +) + +trainer.fit(model, train_dataloader, val_dataloaders=val_dataloader) +trainer.test(model, test_dataloader) + +# %% + +metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv") +del metrics["step"] +metrics.set_index("epoch", inplace=True) +display(metrics.dropna(axis=1, how="all").head()) +sn.relplot(data=metrics, kind="line") + +# %% [markdown] +# ### Bonus: Use [Stochastic Weight Averaging](https://arxiv.org/abs/1803.05407) to get a boost on performance +# +# Use SWA from torch.optim to get a quick performance boost. 
Also shows a couple of cool features from Lightning:
+# - Use `on_train_epoch_end` to run code after the end of every epoch
+# - Use a pretrained model directly with this wrapper for SWA
+
+
+# %%
+class SWAResnet(LitResnet):
+    def __init__(self, trained_model, lr=0.01):
+        super().__init__()
+
+        self.save_hyperparameters("lr")
+        self.model = trained_model
+        self.swa_model = AveragedModel(self.model)
+
+    def forward(self, x):
+        out = self.swa_model(x)
+        return F.log_softmax(out, dim=1)
+
+    def on_train_epoch_end(self):
+        self.swa_model.update_parameters(self.model)
+
+    def validation_step(self, batch, batch_idx, stage=None):
+        x, y = batch
+        logits = F.log_softmax(self.model(x), dim=1)
+        loss = F.nll_loss(logits, y)
+        preds = torch.argmax(logits, dim=1)
+        acc = accuracy(preds, y, task="multiclass", num_classes=10)
+
+        self.log("val_loss", loss, prog_bar=True)
+        self.log("val_acc", acc, prog_bar=True)
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.hparams.lr, momentum=0.9, weight_decay=5e-4)
+        return optimizer
+
+    def on_train_end(self):
+        update_bn(self.trainer.datamodule.train_dataloader(), self.swa_model, device=self.device)
+
+
+# %%
+swa_model = SWAResnet(model.model, lr=0.01)
+
+swa_trainer = L.Trainer(
+    max_epochs=20,
+    accelerator="auto",
+    devices=1,
+    logger=CSVLogger(save_dir="logs/"),
+)
+
+swa_trainer.fit(swa_model, train_dataloader, val_dataloaders=val_dataloader)
+swa_trainer.test(swa_model, test_dataloader)
+
+# %%
+
+metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv")
+del metrics["step"]
+metrics.set_index("epoch", inplace=True)
+display(metrics.dropna(axis=1, how="all").head())
+sn.relplot(data=metrics, kind="line")
diff --git a/lightning_examples/datamodules/.meta.yml b/lightning_examples/datamodules/.meta.yml
new file mode 100644
index 0000000..5c8fc0b
--- /dev/null
+++ b/lightning_examples/datamodules/.meta.yml
@@ -0,0 +1,16 @@
+title: PyTorch Lightning DataModules
+author: PL team
+created: 2020-12-21
+updated: 2023-03-15
+license: CC BY-SA
+build: 0
+description: This notebook will walk you through how to start using Datamodules. With
+  the release of `pytorch-lightning` version 0.9.0, we have included a new class called
+  `LightningDataModule` to help you decouple data related hooks from your `LightningModule`.
+  The most up-to-date documentation on datamodules can be found
+  [here](https://lightning.ai/docs/pytorch/stable/data/datamodule.html).
+requirements: + - torchvision +accelerator: + - CPU + - GPU diff --git a/lightning_examples/datamodules/datamodules.py b/lightning_examples/datamodules/datamodules.py new file mode 100644 index 0000000..dd5b655 --- /dev/null +++ b/lightning_examples/datamodules/datamodules.py @@ -0,0 +1,339 @@ +# %% [markdown] +# ## Introduction +# +# First, we'll go over a regular `LightningModule` implementation without the use of a `LightningDataModule` + +# %% +import os + +import lightning as L +import torch +import torch.nn.functional as F +from torch import nn +from torch.utils.data import DataLoader, random_split +from torchmetrics.functional import accuracy +from torchvision import transforms + +# Note - you must have torchvision installed for this example +from torchvision.datasets import CIFAR10, MNIST + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 + +# %% [markdown] +# ### Defining the LitMNISTModel +# +# Below, we reuse a `LightningModule` from our hello world tutorial that classifies MNIST Handwritten Digits. +# +# Unfortunately, we have hardcoded dataset-specific items within the model, +# forever limiting it to working with MNIST Data. 😢 +# +# This is fine if you don't plan on training/evaluating your model on different datasets. +# However, in many cases, this can become bothersome when you want to try out your architecture with different datasets. + + +# %% +class LitMNIST(L.LightningModule): + def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): + super().__init__() + + # We hardcode dataset specific stuff here. + self.data_dir = data_dir + self.num_classes = 10 + self.dims = (1, 28, 28) + channels, width, height = self.dims + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + # Build model + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, self.num_classes), + ) + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + acc = accuracy(preds, y, task="multiclass", num_classes=10) + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", acc, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + return optimizer + + #################### + # DATA RELATED HOOKS + #################### + + def prepare_data(self): + # download + MNIST(self.data_dir, train=True, download=True) + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform) + + def train_dataloader(self): + return 
DataLoader(self.mnist_train, batch_size=128) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=128) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=128) + + +# %% [markdown] +# ### Training the ListMNIST Model + +# %% +model = LitMNIST() +trainer = L.Trainer( + max_epochs=2, + accelerator="auto", + devices=1, +) +trainer.fit(model) + +# %% [markdown] +# ## Using DataModules +# +# DataModules are a way of decoupling data-related hooks from the `LightningModule +# ` so you can develop dataset agnostic models. + +# %% [markdown] +# ### Defining The MNISTDataModule +# +# Let's go over each function in the class below and talk about what they're doing: +# +# 1. ```__init__``` +# - Takes in a `data_dir` arg that points to where you have downloaded/wish to download the MNIST dataset. +# - Defines a transform that will be applied across train, val, and test dataset splits. +# - Defines default `self.dims`. +# +# +# 2. ```prepare_data``` +# - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there. +# - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`) +# +# 3. ```setup``` +# - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). +# - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'. +# - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage`. +# - **Note this runs across all GPUs and it *is* safe to make state assignments here** +# +# +# 4. ```x_dataloader``` +# - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()` + + +# %% +class MNISTDataModule(L.LightningDataModule): + def __init__(self, data_dir: str = PATH_DATASETS): + super().__init__() + self.data_dir = data_dir + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + self.dims = (1, 28, 28) + self.num_classes = 10 + + def prepare_data(self): + # download + MNIST(self.data_dir, train=True, download=True) + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform) + + def train_dataloader(self): + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) + + +# %% [markdown] +# ### Defining the dataset agnostic `LitModel` +# +# Below, we define the same model as the `LitMNIST` model we made earlier. +# +# However, this time our model has the freedom to use any input data that we'd like 🔥. 
+ + +# %% +class LitModel(L.LightningModule): + def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4): + super().__init__() + + # We take in input dimensions as parameters and use those to dynamically build model. + self.channels = channels + self.width = width + self.height = height + self.num_classes = num_classes + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, num_classes), + ) + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + acc = accuracy(preds, y, task="multiclass", num_classes=10) + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", acc, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + return optimizer + + +# %% [markdown] +# ### Training the `LitModel` using the `MNISTDataModule` +# +# Now, we initialize and train the `LitModel` using the `MNISTDataModule`'s configuration settings and dataloaders. + +# %% +# Init DataModule +dm = MNISTDataModule() +# Init model from datamodule's attributes +model = LitModel(*dm.dims, dm.num_classes) +# Init trainer +trainer = L.Trainer( + max_epochs=3, + accelerator="auto", + devices=1, +) +# Pass the datamodule as arg to trainer.fit to override model hooks :) +trainer.fit(model, dm) + +# %% [markdown] +# ### Defining the CIFAR10 DataModule +# +# Lets prove the `LitModel` we made earlier is dataset agnostic by defining a new datamodule for the CIFAR10 dataset. + + +# %% +class CIFAR10DataModule(L.LightningDataModule): + def __init__(self, data_dir: str = "./"): + super().__init__() + self.data_dir = data_dir + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] + ) + + self.dims = (3, 32, 32) + self.num_classes = 10 + + def prepare_data(self): + # download + CIFAR10(self.data_dir, train=True, download=True) + CIFAR10(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform) + self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform) + + def train_dataloader(self): + return DataLoader(self.cifar_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.cifar_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.cifar_test, batch_size=BATCH_SIZE) + + +# %% [markdown] +# ### Training the `LitModel` using the `CIFAR10DataModule` +# +# Our model isn't very good, so it will perform pretty badly on the CIFAR10 dataset. +# +# The point here is that we can see that our `LitModel` has no problem using a different datamodule as its input data. 
+ +# %% +dm = CIFAR10DataModule() +model = LitModel(*dm.dims, dm.num_classes, hidden_size=256) +trainer = L.Trainer( + max_epochs=5, + accelerator="auto", + devices=1, +) +trainer.fit(model, dm) diff --git a/lightning_examples/finetuning-scheduler/.meta.yml b/lightning_examples/finetuning-scheduler/.meta.yml new file mode 100644 index 0000000..156f434 --- /dev/null +++ b/lightning_examples/finetuning-scheduler/.meta.yml @@ -0,0 +1,20 @@ +title: Fine-Tuning Scheduler +author: "[Dan Dale](https://github.com/speediedan)" +created: 2021-11-29 +updated: 2023-04-06 +license: CC BY-SA +build: 0 +tags: + - Fine-Tuning +description: | + This notebook introduces the [Fine-Tuning Scheduler](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) extension + and demonstrates the use of it to fine-tune a small foundation model on the + [RTE](https://huggingface.co/datasets/viewer/?dataset=super_glue&config=rte) task of + [SuperGLUE](https://super.gluebenchmark.com/) with iterative early-stopping defined according to a user-specified + schedule. It uses Hugging Face's ``datasets`` and ``transformers`` libraries to retrieve the relevant benchmark data + and foundation model weights. The required dependencies are installed via the finetuning-scheduler ``[examples]`` extra. +requirements: + - finetuning-scheduler[examples]>=2.0.0 + - torch>=1.12.1 # to avoid https://github.com/pytorch/pytorch/issues/80809 with torch 1.12.0 +accelerator: + - GPU diff --git a/lightning_examples/finetuning-scheduler/RteBoolqModule_ft_schedule_deberta_base.yaml b/lightning_examples/finetuning-scheduler/RteBoolqModule_ft_schedule_deberta_base.yaml new file mode 100644 index 0000000..62bdbae --- /dev/null +++ b/lightning_examples/finetuning-scheduler/RteBoolqModule_ft_schedule_deberta_base.yaml @@ -0,0 +1,18 @@ + +0: + params: + - model.classifier.bias + - model.classifier.weight + - model.pooler.dense.bias + - model.pooler.dense.weight + - model.deberta.encoder.LayerNorm.bias + - model.deberta.encoder.LayerNorm.weight + - model.deberta.encoder.rel_embeddings.weight + - model.deberta.encoder.layer.{0,11}.(output|attention|intermediate).* +1: + params: + - model.deberta.embeddings.LayerNorm.bias + - model.deberta.embeddings.LayerNorm.weight +2: + params: + - model.deberta.embeddings.word_embeddings.weight diff --git a/lightning_examples/finetuning-scheduler/emphasized_yaml.png b/lightning_examples/finetuning-scheduler/emphasized_yaml.png new file mode 100644 index 0000000..492be1d Binary files /dev/null and b/lightning_examples/finetuning-scheduler/emphasized_yaml.png differ diff --git a/lightning_examples/finetuning-scheduler/finetuning-scheduler.py b/lightning_examples/finetuning-scheduler/finetuning-scheduler.py new file mode 100644 index 0000000..637c45c --- /dev/null +++ b/lightning_examples/finetuning-scheduler/finetuning-scheduler.py @@ -0,0 +1,668 @@ +# %% [markdown] +# ## Scheduled Fine-Tuning with the Fine-Tuning Scheduler Extension +# +# ![Fine-Tuning Scheduler logo](logo_fts.png){height="55px" width="401px"} +# +# The [Fine-Tuning Scheduler](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) extension accelerates and enhances model experimentation with flexible fine-tuning schedules. 
+# +# Training with the extension is simple and confers a host of benefits: +# +# - it dramatically increases fine-tuning flexibility +# - expedites and facilitates exploration of model tuning dynamics +# - enables marginal performance improvements of fine-tuned models +# +# Setup is straightforward, just install from PyPI! Since this notebook-based example requires a few additional packages (e.g. +# ``transformers``, ``sentencepiece``), we installed the ``finetuning-scheduler`` package with the ``[examples]`` extra above. +# Once the ``finetuning-scheduler`` package is installed, the [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) callback (FTS) is available for use with Lightning. +# For additional installation options, please see the Fine-Tuning Scheduler [README](https://github.com/speediedan/finetuning-scheduler/blob/main/README.md). +# +# +# +#
+# +# Fundamentally, [Fine-Tuning Scheduler](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) enables +# scheduled, multi-phase, fine-tuning of foundation models. Gradual unfreezing (i.e. thawing) can help maximize +# foundation model knowledge retention while allowing (typically upper layers of) the model to +# optimally adapt to new tasks during transfer learning [1, 2, 3](#f1) +# +#
+# +# The [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) callback orchestrates the gradual unfreezing +# of models via a fine-tuning schedule that is either implicitly generated (the default) or explicitly provided by the user +# (more computationally efficient). Fine-tuning phase transitions are driven by +# [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) +# criteria (a multi-phase extension of ``EarlyStopping`` packaged with FinetuningScheduler), user-specified epoch transitions or a composition of the two (the default mode). +# A [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) training session completes when the +# final phase of the schedule has its stopping criteria met. See +# the [early stopping documentation](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.EarlyStopping.html) for more details on that callback's configuration. +# +# ![FinetuningScheduler explicit loss animation](fts_explicit_loss_anim.gif){height="272px" width="376px"} + +# %% [markdown] +# +# ## Basic Usage +# +#
+# +# If no fine-tuning schedule is provided by the user, [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) will generate a +# [default schedule](#The-Default-Finetuning-Schedule) and proceed to fine-tune according to the generated schedule, +# using default [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) and [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) callbacks with ``monitor=val_loss``. +# +#
+# +# ```python +# import lightning as L +# from finetuning_scheduler import FinetuningScheduler +# trainer = L.Trainer(callbacks=[FinetuningScheduler()]) +# ``` + +# %% [markdown] +# ## The Default Fine-Tuning Schedule +# +# Schedule definition is facilitated via the [gen_ft_schedule](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.ScheduleImplMixin.gen_ft_schedule) method which dumps a default fine-tuning schedule (by default using a naive, 2-parameters per level heuristic) which can be adjusted as +# desired by the user and/or subsequently passed to the callback. Using the default/implicitly generated schedule will likely be less computationally efficient than a user-defined fine-tuning schedule but is useful for exploring a model's fine-tuning behavior and can serve as a good baseline for subsequent explicit schedule refinement. +# While the current version of [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) only supports single optimizer and (optional) lr_scheduler configurations, per-phase maximum learning rates can be set as demonstrated in the next section. + +# %% [markdown] +# ## Specifying a Fine-Tuning Schedule +# +# To specify a fine-tuning schedule, it's convenient to first generate the default schedule and then alter the thawed/unfrozen parameter groups associated with each fine-tuning phase as desired. Fine-tuning phases are zero-indexed and executed in ascending order. +# +# 1. First, generate the default schedule to ``Trainer.log_dir``. It will be named after your +# ``LightningModule`` subclass with the suffix ``_ft_schedule.yaml``. +# +# ```python +# import lightning as L +# from finetuning_scheduler import FinetuningScheduler +# trainer = L.Trainer(callbacks=[FinetuningScheduler(gen_ft_sched_only=True)]) +# ``` +# +# 2. Alter the schedule as desired. +# +# ![side_by_side_yaml](side_by_side_yaml.png){height="327px" width="800px"} +# +# 3. Once the fine-tuning schedule has been altered as desired, pass it to +# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) to commence scheduled training: +# +# ```python +# import lightning as L +# from finetuning_scheduler import FinetuningScheduler +# +# trainer = L.Trainer(callbacks=[FinetuningScheduler(ft_schedule="/path/to/my/schedule/my_schedule.yaml")]) +# ``` + +# %% [markdown] +# ## Early-Stopping and Epoch-Driven Phase Transition Criteria +# +# +# By default, [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) and epoch-driven +# transition criteria are composed. If a ``max_transition_epoch`` is specified for a given phase, the next fine-tuning phase will begin at that epoch unless [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) criteria are met first. 
+# If [FinetuningScheduler.epoch_transitions_only](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler.params.epoch_transitions_only) is ``True``, [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) will not be used +# and transitions will be exclusively epoch-driven. +# +# +#
+
+# **Tip:** Using regular expressions can be convenient for specifying more complex schedules. Also, a per-phase base maximum lr can be specified:
+#
+# ![emphasized_yaml](emphasized_yaml.png){height="380px" width="800px"}
+#
+#
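+# For instance, here is a minimal sketch combining the two mechanisms discussed above: an epoch-driven transition cap
+# for phase ``0`` and a per-phase base maximum lr for phase ``1``. The parameter-name patterns below are hypothetical
+# placeholders rather than names taken from the model used later in this notebook:
+#
+# ```python
+# example_schedule = """
+# 0:
+#   params:
+#     - model.classifier.*
+#   max_transition_epoch: 3  # transition to phase 1 at epoch 3 at the latest (earlier if FTSEarlyStopping triggers)
+# 1:
+#   params:
+#     - model.encoder.layer.{10,11}.(output|attention|intermediate).*
+#   lr: 1.0e-05  # base maximum lr applied to the parameters thawed in this phase
+# """
+# ```
+#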
+# +# +# +# The end-to-end example in this notebook ([Scheduled Fine-Tuning For SuperGLUE](#superglue)) uses [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) in explicit mode to fine-tune a small foundation model on the [RTE](https://huggingface.co/datasets/viewer/?dataset=super_glue&config=rte) task of [SuperGLUE](https://super.gluebenchmark.com/). +# Please see the [official Fine-Tuning Scheduler documentation](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) if you are interested in a similar [CLI-based example](https://finetuning-scheduler.readthedocs.io/en/stable/index.html#example-scheduled-fine-tuning-for-superglue) using the LightningCLI. + +# %% [markdown] +# ## Resuming Scheduled Fine-Tuning Training Sessions +# +# Resumption of scheduled fine-tuning training is identical to the continuation of +# [other training sessions](https://lightning.ai/docs/pytorch/stable/common/trainer.html) with the caveat that the provided checkpoint must have been saved by a [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) session. +# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) uses [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) (an extension of ``ModelCheckpoint``) to maintain schedule state with special metadata. +# +# +# ```python +# import lightning as L +# from finetuning_scheduler import FinetuningScheduler +# trainer = L.Trainer(callbacks=[FinetuningScheduler()]) +# trainer.ckpt_path="some/path/to/my_checkpoint.ckpt" +# trainer.fit(...) +# ``` +# +# Training will resume at the depth/level of the provided checkpoint according to the specified schedule. Schedules can be altered between training sessions but schedule compatibility is left to the user for maximal flexibility. If executing a user-defined schedule, typically the same schedule should be provided for the original and resumed training sessions. +# +# By default ([FinetuningScheduler.restore_best](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=restore_best#finetuning_scheduler.fts.FinetuningScheduler.params.restore_best) is ``True``), [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) will attempt to restore the best available checkpoint before fine-tuning depth transitions. +# +# ```python +# trainer = L.Trainer(callbacks=[FinetuningScheduler()]) +# trainer.ckpt_path="some/path/to/my_kth_best_checkpoint.ckpt" +# trainer.fit(...) +# ``` +# +# Note that similar to the behavior of [ModelCheckpoint](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html), when resuming training with a +# different [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) ``dirpath`` from the provided +# checkpoint, the new training session's checkpoint state will be re-initialized at the resumption depth with the provided checkpoint being set as the best checkpoint. + +# %% [markdown] +#
+# +# **Note:** Currently, [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) supports the following distributed strategy types: +# +# - ``ddp`` (and aliases ``ddp_find_unused_parameters_false``, ``ddp_find_unused_parameters_true``, ``ddp_spawn``, ``ddp_fork``, ``ddp_notebook``) +# - ``fsdp`` (and alias ``fsdp_cpu_offload``) +# +# Custom or officially unsupported strategies can be used by setting [FinetuningScheduler.allow_untested](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=allow_untested#finetuning_scheduler.fts.FinetuningScheduler.params.allow_untested) to ``True``. +# Note that most currently unsupported strategies are so because they require varying degrees of modification to be compatible. For example, ``deepspeed`` will require a [StrategyAdapter](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.strategy_adapters.html#finetuning_scheduler.strategy_adapters.StrategyAdapter) to be written (similar to the one for ``FSDP``, [FSDPStrategyAdapter](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.strategy_adapters.html#finetuning_scheduler.strategy_adapters.FSDPStrategyAdapter)) before support can be added (PRs welcome!), +# while ``tpu_spawn`` would require an override of the current broadcast method to include python objects. +#
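+#
+# A minimal sketch of pairing the callback with one of the supported strategies (the accelerator and device count
+# here are illustrative):
+#
+# ```python
+# import lightning as L
+# from finetuning_scheduler import FinetuningScheduler
+#
+# trainer = L.Trainer(strategy="ddp", accelerator="gpu", devices=2, callbacks=[FinetuningScheduler()])
+# ```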
+ +# %% [markdown] +#
+# +# ## Scheduled Fine-Tuning For SuperGLUE +# +# The following example demonstrates the use of [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) to fine-tune a small foundation model on the [RTE](https://huggingface.co/datasets/viewer/?dataset=super_glue&config=rte) task of [SuperGLUE](https://super.gluebenchmark.com/). Iterative early-stopping will be applied according to a user-specified schedule. +# + +# %% +import os +import warnings +from datetime import datetime +from typing import Any, Dict, Optional + +import sentencepiece as sp # noqa: F401 # isort: split +import datasets +import evaluate +import lightning as L +import torch +from datasets import logging as datasets_logging +from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint +from lightning.pytorch.loggers.tensorboard import TensorBoardLogger +from lightning.pytorch.utilities import rank_zero_warn +from torch.optim.adamw import AdamW +from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts +from torch.utils.data import DataLoader +from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer +from transformers import logging as transformers_logging +from transformers.tokenization_utils_base import BatchEncoding + +# %% +# Import the `FinetuningScheduler` PyTorch Lightning extension module we want to use. This will import all necessary callbacks. +import finetuning_scheduler as fts # isort: split + +# set notebook-level variables +TASK_NUM_LABELS = {"boolq": 2, "rte": 2} +DEFAULT_TASK = "rte" + +# reduce hf logging verbosity to focus on tutorial-relevant code/messages +for hflogger in [transformers_logging, datasets_logging]: + hflogger.set_verbosity_error() +# ignore warnings related tokenizers_parallelism/DataLoader parallelism trade-off and +# expected logging behavior +for warnf in [ + r".*does not have many workers.*", + r".*The number of training samples.*", + r".*converting to a fast.*", + r".*number of training batches.*", +]: + warnings.filterwarnings("ignore", warnf) + + +# %% +class RteBoolqDataModule(L.LightningDataModule): + """A ``LightningDataModule`` designed for both the RTE or BoolQ SuperGLUE Hugging Face datasets.""" + + TASK_TEXT_FIELD_MAP = {"rte": ("premise", "hypothesis"), "boolq": ("question", "passage")} + LOADER_COLUMNS = ( + "datasets_idx", + "input_ids", + "token_type_ids", + "attention_mask", + "start_positions", + "end_positions", + "labels", + ) + + def __init__( + self, + model_name_or_path: str, + task_name: str = DEFAULT_TASK, + max_seq_length: int = 128, + train_batch_size: int = 16, + eval_batch_size: int = 16, + tokenizers_parallelism: bool = True, + **dataloader_kwargs: Any, + ): + r"""Initialize the ``LightningDataModule`` designed for both the RTE or BoolQ SuperGLUE Hugging Face + datasets. + + Args: + model_name_or_path (str): + Can be either: + - A string, the ``model id`` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced + under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a ``directory`` containing model weights saved using + :meth:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. + task_name (str, optional): Name of the SuperGLUE task to execute. This module supports 'rte' or 'boolq'. + Defaults to DEFAULT_TASK which is 'rte'. 
+ max_seq_length (int, optional): Length to which we will pad sequences or truncate input. Defaults to 128. + train_batch_size (int, optional): Training batch size. Defaults to 16. + eval_batch_size (int, optional): Batch size to use for validation and testing splits. Defaults to 16. + tokenizers_parallelism (bool, optional): Whether to use parallelism in the tokenizer. Defaults to True. + \**dataloader_kwargs: Arguments passed when initializing the dataloader. + """ + super().__init__() + task_name = task_name if task_name in TASK_NUM_LABELS.keys() else DEFAULT_TASK + self.text_fields = self.TASK_TEXT_FIELD_MAP[task_name] + self.dataloader_kwargs = { + "num_workers": dataloader_kwargs.get("num_workers", 0), + "pin_memory": dataloader_kwargs.get("pin_memory", False), + } + self.save_hyperparameters() + os.environ["TOKENIZERS_PARALLELISM"] = "true" if self.hparams.tokenizers_parallelism else "false" + self.tokenizer = AutoTokenizer.from_pretrained( + self.hparams.model_name_or_path, use_fast=True, local_files_only=False + ) + + def prepare_data(self): + """Load the SuperGLUE dataset.""" + # N.B. PL calls prepare_data from a single process (rank 0) so do not use it to assign + # state (e.g. self.x=y) + datasets.load_dataset("super_glue", self.hparams.task_name) + + def setup(self, stage): + """Setup our dataset splits for training/validation.""" + self.dataset = datasets.load_dataset("super_glue", self.hparams.task_name) + for split in self.dataset.keys(): + self.dataset[split] = self.dataset[split].map( + self._convert_to_features, batched=True, remove_columns=["label"] + ) + self.columns = [c for c in self.dataset[split].column_names if c in self.LOADER_COLUMNS] + self.dataset[split].set_format(type="torch", columns=self.columns) + + self.eval_splits = [x for x in self.dataset.keys() if "validation" in x] + + def train_dataloader(self): + return DataLoader(self.dataset["train"], batch_size=self.hparams.train_batch_size, **self.dataloader_kwargs) + + def val_dataloader(self): + return DataLoader(self.dataset["validation"], batch_size=self.hparams.eval_batch_size, **self.dataloader_kwargs) + + def _convert_to_features(self, example_batch: datasets.arrow_dataset.LazyDict) -> BatchEncoding: + """Convert raw text examples to a :class:`~transformers.tokenization_utils_base.BatchEncoding` container + (derived from python dict) of features that includes helpful methods for translating between word/character + space and token space. + + Args: + example_batch ([type]): The set of examples to convert to token space. + + Returns: + ``BatchEncoding``: A batch of encoded examples (note default tokenizer batch_size=1000). 
+ """ + text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]])) + # Tokenize the text/text pairs + features = self.tokenizer.batch_encode_plus( + text_pairs, max_length=self.hparams.max_seq_length, padding="longest", truncation=True + ) + # Rename label to labels to make it easier to pass to model forward + features["labels"] = example_batch["label"] + return features + + +# %% +class RteBoolqModule(L.LightningModule): + """A ``LightningModule`` that can be used to fine-tune a foundation model on either the RTE or BoolQ SuperGLUE + tasks using Hugging Face implementations of a given model and the `SuperGLUE Hugging Face dataset.""" + + def __init__( + self, + model_name_or_path: str, + optimizer_init: Dict[str, Any], + lr_scheduler_init: Dict[str, Any], + model_cfg: Optional[Dict[str, Any]] = None, + task_name: str = DEFAULT_TASK, + experiment_tag: str = "default", + ): + """ + Args: + model_name_or_path (str): Path to pretrained model or identifier from https://huggingface.co/models. + optimizer_init (Dict[str, Any]): The desired optimizer configuration. + lr_scheduler_init (Dict[str, Any]): The desired learning rate scheduler config. + model_cfg (Optional[Dict[str, Any]], optional): Defines overrides of the default model config. Defaults to + ``None``. + task_name (str, optional): The SuperGLUE task to execute, one of ``'rte'``, ``'boolq'``. Defaults to "rte". + experiment_tag (str, optional): The tag to use for the experiment and tensorboard logs. Defaults to + "default". + """ + super().__init__() + if task_name not in TASK_NUM_LABELS.keys(): + rank_zero_warn(f"Invalid task_name {task_name!r}. Proceeding with the default task: {DEFAULT_TASK!r}") + task_name = DEFAULT_TASK + self.num_labels = TASK_NUM_LABELS[task_name] + self.model_cfg = model_cfg or {} + conf = AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels, local_files_only=False) + self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=conf) + self.model.config.update(self.model_cfg) # apply model config overrides + self.init_hparams = { + "optimizer_init": optimizer_init, + "lr_scheduler_init": lr_scheduler_init, + "model_config": self.model.config, + "model_name_or_path": model_name_or_path, + "task_name": task_name, + "experiment_id": f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{experiment_tag}", + } + self.save_hyperparameters(self.init_hparams) + self.metric = evaluate.load("super_glue", self.hparams.task_name, experiment_id=self.hparams.experiment_id) + self.no_decay = ["bias", "LayerNorm.weight"] + + @property + def finetuningscheduler_callback(self) -> fts.FinetuningScheduler: + fts_callback = [c for c in self.trainer.callbacks if isinstance(c, fts.FinetuningScheduler)] + return fts_callback[0] if fts_callback else None + + def forward(self, **inputs): + return self.model(**inputs) + + def training_step(self, batch, batch_idx: int): + loss = self(**batch)[0] + self.log("train_loss", loss, prog_bar=True) + return loss + + def on_train_epoch_end(self): + if self.finetuningscheduler_callback: + self.log("finetuning_schedule_depth", float(self.finetuningscheduler_callback.curr_depth)) + + def validation_step(self, batch, batch_idx, dataloader_idx=0): + outputs = self(**batch) + val_loss, logits = outputs[:2] + if self.num_labels >= 1: + preds = torch.argmax(logits, axis=1) + elif self.num_labels == 1: + preds = logits.squeeze() + labels = batch["labels"] + self.log("val_loss", val_loss, prog_bar=True) + metric_dict = 
self.metric.compute(predictions=preds, references=labels) + self.log_dict(metric_dict, prog_bar=True) + + def configure_optimizers(self): + # With FTS >= 2.0, ``FinetuningScheduler`` simplifies initial optimizer configuration by ensuring the optimizer + # configured here will optimize the parameters (and only those parameters) scheduled to be optimized in phase 0 + # of the current fine-tuning schedule. This auto-configuration can be disabled if desired by setting + # ``enforce_phase0_params`` to ``False``. + optimizer = AdamW(params=self.model.parameters(), **self.hparams.optimizer_init) + scheduler = { + "scheduler": CosineAnnealingWarmRestarts(optimizer, **self.hparams.lr_scheduler_init), + "interval": "epoch", + } + return [optimizer], [scheduler] + + +# %% [markdown] +# ### Our Training Sessions +# +# We'll be comparing three different fine-tuning training configurations. Every configuration in this example depends +# upon a shared set of defaults, only differing in their respective fine-tuning schedules. +# +# | Experiment Tag | Training Scenario Description | +# |:-----------------:| ---------------------------------------------------------------------- | +# | ``fts_explicit`` | Training with a fine-tuning schedule explicitly provided by the user | +# | ``nofts_baseline``| A baseline fine-tuning training session (without scheduled fine-tuning) | +# | ``fts_implicit`` | Training with an implicitly generated fine-tuning schedule (the default) | +# +# Let's begin by configuring the ``fts_explicit`` scenario. We'll subsequently run the other two scenarios for +# comparison. + +# %% +# Let's create a fine-tuning schedule for our model and run an explicitly scheduled fine-tuning training scenario with it +# Please see the [FinetuningScheduler documentation](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) for a full description of the schedule format + + +ft_schedule_yaml = """ +0: + params: + - model.classifier.bias + - model.classifier.weight + - model.pooler.dense.bias + - model.pooler.dense.weight + - model.deberta.encoder.LayerNorm.bias + - model.deberta.encoder.LayerNorm.weight + - model.deberta.encoder.rel_embeddings.weight + - model.deberta.encoder.layer.{0,11}.(output|attention|intermediate).* +1: + params: + - model.deberta.embeddings.LayerNorm.bias + - model.deberta.embeddings.LayerNorm.weight +2: + params: + - model.deberta.embeddings.word_embeddings.weight +""" +ft_schedule_name = "RteBoolqModule_ft_schedule_deberta_base.yaml" +# Let's write the schedule to a file so we can simulate loading an explicitly defined fine-tuning +# schedule. +with open(ft_schedule_name, "w") as f: + f.write(ft_schedule_yaml) + +# %% +datasets.logging.disable_progress_bar() +L.seed_everything(42) +dm = RteBoolqDataModule(model_name_or_path="microsoft/deberta-v3-base", tokenizers_parallelism=True) + +# %% [markdown] +# ### Optimizer Configuration +# +#
+
+# Though other optimizers can arguably yield some marginal advantage contingent on the context,
+# the Adam optimizer (and the [AdamW version](https://pytorch.org/docs/stable/_modules/torch/optim/adamw.html#AdamW), which
+# implements decoupled weight decay) remains robust to hyperparameter choices and is commonly used for fine-tuning
+# foundation language models. See [(Sivaprasad et al., 2020)](#f2) and [(Mosbach, Andriushchenko & Klakow, 2020)](#f3) for theoretical and systematic empirical justifications of Adam and its use in fine-tuning
+# large transformer-based language models. The values used here have some justification
+# in the referenced literature but have been largely empirically determined and, while a good
+# starting point, could be further tuned.
+#
+#
+ +# %% +optimizer_init = {"weight_decay": 1e-05, "eps": 1e-07, "lr": 1e-05} + +# %% [markdown] +# ### LR Scheduler Configuration +# +#
+
+# The [CosineAnnealingWarmRestarts scheduler](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingWarmRestarts.html?highlight=cosineannealingwarm#torch.optim.lr_scheduler.CosineAnnealingWarmRestarts) fits nicely with our iterative fine-tuning since it does not depend upon a global max_epoch
+# value. The importance of initial warmup is reduced due to the innate warmup effect of Adam bias correction [[5]](#f3)
+# and the gradual thawing we are performing. Note that commonly used LR schedulers that depend on providing
+# max_iterations/epochs (e.g. the
+# [CosineWarmupScheduler](https://github.com/Lightning-AI/tutorials/blob/0c325829101d5a6ebf32ed99bbf5b09badf04a59/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py#L688)
+# used in other pytorch-lightning tutorials) also work with FinetuningScheduler. Though the LR scheduler is theoretically
+# justified [(Loshchilov & Hutter, 2016)](#f4), the particular values provided here are primarily empirically driven.
+#
+# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) also supports both optimizer and LR scheduler
+# reinitialization in explicit and implicit fine-tuning schedule modes. See the advanced usage documentation ([LR scheduler reinitialization](https://finetuning-scheduler.readthedocs.io/en/stable/advanced/lr_scheduler_reinitialization.html), [optimizer reinitialization](https://finetuning-scheduler.readthedocs.io/en/stable/advanced/optimizer_reinitialization.html)) for explanations and demonstrations of the extension's support for more complex requirements.
+#
+ + +# %% +lr_scheduler_init = {"T_0": 1, "T_mult": 2, "eta_min": 1e-07} + +# %% +# Load our lightning module... +lightning_module_kwargs = { + "model_name_or_path": "microsoft/deberta-v3-base", + "optimizer_init": optimizer_init, + "lr_scheduler_init": lr_scheduler_init, +} +model = RteBoolqModule(**lightning_module_kwargs, experiment_tag="fts_explicit") + +# %% [markdown] +# ### Callback Configuration +# +# The only callback required to invoke the [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) is the [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) callback itself. +# Default versions of [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) and [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) +# (if not specifying ``epoch_only_transitions``) will be included ([as discussed above](#basic_usage)) if not provided +# in the callbacks list. For demonstration purposes I'm including example configurations of all three callbacks below. + +# %% +# let's save our callback configurations for the explicit scenario since we'll be reusing the same +# configurations for the implicit and nofts_baseline scenarios (except the config for the +# FinetuningScheduler callback itself of course in the case of nofts_baseline) +earlystopping_kwargs = {"monitor": "val_loss", "min_delta": 0.001, "patience": 2} +checkpoint_kwargs = {"monitor": "val_loss", "save_top_k": 1} +fts_kwargs = {"max_depth": 1} +callbacks = [ + fts.FinetuningScheduler(ft_schedule=ft_schedule_name, **fts_kwargs), + fts.FTSEarlyStopping(**earlystopping_kwargs), + fts.FTSCheckpoint(**checkpoint_kwargs), +] + +# %% +logger = TensorBoardLogger("lightning_logs", name="fts_explicit") +# optionally start tensorboard and monitor progress graphically while viewing multi-phase fine-tuning specific training +# logs in the cell output below by uncommenting the next 2 lines +# # %load_ext tensorboard +# # %tensorboard --logdir lightning_logs +# disable progress bar by default to focus on multi-phase training logs. Set to True to re-enable if desired +enable_progress_bar = False + +# %% + + +def train() -> None: + trainer = L.Trainer( + enable_progress_bar=enable_progress_bar, + max_epochs=100, + precision="16-mixed", + accelerator="auto", + devices=1, + callbacks=callbacks, + logger=logger, + ) + trainer.fit(model, datamodule=dm) + + +print( + "Note given the computation associated w/ the multiple phases of fine-tuning demonstrated, this notebook is best used with an accelerator" +) +train() + +# %% [markdown] +# ### Running the Baseline and Implicit Fine-Tuning Scenarios +# +# Let's now compare our ``nofts_baseline`` and ``fts_implicit`` scenarios with the ``fts_explicit`` one we just ran. +# +# We'll need to update our callbacks list, using the core PL ``EarlyStopping`` and ``ModelCheckpoint`` callbacks for the +# ``nofts_baseline`` (which operate identically to their FTS analogs apart from the recursive training support). +# For both core Lightning and user-registered callbacks, we can define our callbacks using a dictionary as we do +# with the LightningCLI. 
This allows us to avoid managing imports and support more complex configuration separated from +# code. +# +# Note that we'll be using identical callback configurations to the ``fts_explicit`` scenario. Keeping [max_depth](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=max_depth#finetuning_scheduler.fts.FinetuningScheduler.params.max_depth) for +# the implicit schedule will limit fine-tuning to just the last 4 parameters of the model, which is only a small fraction +# of the parameters you'd want to tune for maximum performance. Since the implicit schedule is quite computationally +# intensive and most useful for exploring model behavior, leaving [max_depth](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=max_depth#finetuning_scheduler.fts.FinetuningScheduler.params.max_depth) 1 allows us to demo implicit mode +# behavior while keeping the computational cost and runtime of this notebook reasonable. To review how a full implicit +# mode run compares to the ``nofts_baseline`` and ``fts_explicit`` scenarios, please see the the following +# [tensorboard experiment summary](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/). + + +# %% +nofts_callbacks = [EarlyStopping(**earlystopping_kwargs), ModelCheckpoint(**checkpoint_kwargs)] +fts_implicit_callbacks = [ + fts.FinetuningScheduler(**fts_kwargs), + fts.FTSEarlyStopping(**earlystopping_kwargs), + fts.FTSCheckpoint(**checkpoint_kwargs), +] +scenario_callbacks = {"nofts_baseline": nofts_callbacks, "fts_implicit": fts_implicit_callbacks} + +# %% +for scenario_name, scenario_callbacks in scenario_callbacks.items(): + model = RteBoolqModule(**lightning_module_kwargs, experiment_tag=scenario_name) + logger = TensorBoardLogger("lightning_logs", name=scenario_name) + callbacks = scenario_callbacks + print(f"Beginning training the '{scenario_name}' scenario") + train() + +# %% [markdown] +# ### Reviewing the Training Results +# +# See the [tensorboard experiment summaries](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/) to get a sense +# of the relative computational and performance tradeoffs associated with these [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) configurations. +# The summary compares a full ``fts_implicit`` execution to ``fts_explicit`` and ``nofts_baseline`` scenarios using DDP +# training with 2 GPUs. The full logs/schedules for all three scenarios are available +# [here](https://drive.google.com/file/d/1LrUcisRLHeJgh_BDOOD_GUBPp5iHAkoR/view?usp=sharing) and the checkpoints +# produced in the scenarios [here](https://drive.google.com/file/d/1t7myBgcqcZ9ax_IT9QVk-vFH_l_o5UXB/view?usp=sharing) +# (caution, ~3.5GB). 
+# +# [![fts_explicit_accuracy](fts_explicit_accuracy.png){height="315px" width="492px"}](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/#scalars&_smoothingWeight=0&runSelectionState=eyJmdHNfZXhwbGljaXQiOnRydWUsIm5vZnRzX2Jhc2VsaW5lIjpmYWxzZSwiZnRzX2ltcGxpY2l0IjpmYWxzZX0%3D) +# [![nofts_baseline](nofts_baseline_accuracy.png){height="316px" width="505px"}](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/#scalars&_smoothingWeight=0&runSelectionState=eyJmdHNfZXhwbGljaXQiOmZhbHNlLCJub2Z0c19iYXNlbGluZSI6dHJ1ZSwiZnRzX2ltcGxpY2l0IjpmYWxzZX0%3D) +# +# Note that given execution context differences, there could be a modest variation in performance from the tensorboard summaries generated by this notebook. +# +# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) expands the space of possible fine-tuning schedules and the composition of more sophisticated schedules can +# yield marginal fine-tuning performance gains. That stated, it should be emphasized the primary utility of [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) is to grant +# greater fine-tuning flexibility for model exploration in research. For example, glancing at DeBERTa-v3's implicit training +# run, a critical tuning transition point is immediately apparent: +# +# [![implicit_training_transition](implicit_training_transition.png){height="272px" width="494px"}](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/#scalars&_smoothingWeight=0&runSelectionState=eyJmdHNfZXhwbGljaXQiOmZhbHNlLCJub2Z0c19iYXNlbGluZSI6ZmFsc2UsImZ0c19pbXBsaWNpdCI6dHJ1ZX0%3D) +# +# Our `val_loss` begins a precipitous decline at step 3119 which corresponds to phase 17 in the schedule. Referring to our +# schedule, in phase 17 we're beginning tuning the attention parameters of our 10th encoder layer (of 11). Interesting! +# Though beyond the scope of this tutorial, it might be worth investigating these dynamics further and +# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) allows one to do just that quite easily. +# +# %% [markdown] +# +# Note that though this example is intended to capture a common usage scenario, substantial variation is expected +# among use cases and models. +# In summary, [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) provides increased fine-tuning flexibility that can be useful in a variety of +# contexts from exploring model tuning behavior to maximizing performance. +# %% [markdown] +# ## Footnotes +# +#
    +#
+# 1.
+#
+# [Howard, J., & Ruder, S. (2018)](https://arxiv.org/pdf/1801.06146.pdf). Universal Language Model
+# Fine-tuning for Text Classification. ArXiv, abs/1801.06146. [↩](#a1)
+#
+# 2.
+#
+# [Chronopoulou, A., Baziotis, C., & Potamianos, A. (2019)](https://arxiv.org/pdf/1902.10547.pdf).
+# An embarrassingly simple approach for transfer learning from pretrained language models. arXiv
+# preprint arXiv:1902.10547. [↩](#a1)
+#
+# 3.
+#
+# [Peters, M. E., Ruder, S., & Smith, N. A. (2019)](https://arxiv.org/pdf/1903.05987.pdf). To tune or not to
+# tune? Adapting pretrained representations to diverse tasks. arXiv preprint arXiv:1903.05987. [↩](#a1)
+#
+# 4.
+#
+# [Sivaprasad, P. T., Mai, F., Vogels, T., Jaggi, M., & Fleuret, F. (2020)](https://arxiv.org/pdf/1910.11758.pdf).
+# Optimizer benchmarking needs to account for hyperparameter tuning. In International Conference on Machine Learning
+# (pp. 9036-9045). PMLR. [↩](#a2)
+#
+# 5.
+#
+# [Mosbach, M., Andriushchenko, M., & Klakow, D. (2020)](https://arxiv.org/pdf/2006.04884.pdf). On the stability of
+# fine-tuning BERT: Misconceptions, explanations, and strong baselines. arXiv preprint arXiv:2006.04884. [↩](#a2)
+#
+# 6.
+#
+# [Loshchilov, I., & Hutter, F. (2016)](https://arxiv.org/pdf/1608.03983.pdf). SGDR: Stochastic gradient descent with
+# warm restarts. arXiv preprint arXiv:1608.03983. [↩](#a3)
+#
+#
+ +# %% [markdown] +# diff --git a/lightning_examples/finetuning-scheduler/fts_explicit_accuracy.png b/lightning_examples/finetuning-scheduler/fts_explicit_accuracy.png new file mode 100644 index 0000000..b5d8f55 Binary files /dev/null and b/lightning_examples/finetuning-scheduler/fts_explicit_accuracy.png differ diff --git a/lightning_examples/finetuning-scheduler/fts_explicit_loss_anim.gif b/lightning_examples/finetuning-scheduler/fts_explicit_loss_anim.gif new file mode 100644 index 0000000..7451f65 Binary files /dev/null and b/lightning_examples/finetuning-scheduler/fts_explicit_loss_anim.gif differ diff --git a/lightning_examples/finetuning-scheduler/implicit_training_transition.png b/lightning_examples/finetuning-scheduler/implicit_training_transition.png new file mode 100644 index 0000000..6854dbf Binary files /dev/null and b/lightning_examples/finetuning-scheduler/implicit_training_transition.png differ diff --git a/lightning_examples/finetuning-scheduler/logo_fts.png b/lightning_examples/finetuning-scheduler/logo_fts.png new file mode 100644 index 0000000..02e14a3 Binary files /dev/null and b/lightning_examples/finetuning-scheduler/logo_fts.png differ diff --git a/lightning_examples/finetuning-scheduler/nofts_baseline_accuracy.png b/lightning_examples/finetuning-scheduler/nofts_baseline_accuracy.png new file mode 100644 index 0000000..b78f8c6 Binary files /dev/null and b/lightning_examples/finetuning-scheduler/nofts_baseline_accuracy.png differ diff --git a/lightning_examples/finetuning-scheduler/side_by_side_yaml.png b/lightning_examples/finetuning-scheduler/side_by_side_yaml.png new file mode 100644 index 0000000..3a32a1f Binary files /dev/null and b/lightning_examples/finetuning-scheduler/side_by_side_yaml.png differ diff --git a/lightning_examples/mnist-hello-world/.meta.yml b/lightning_examples/mnist-hello-world/.meta.yml new file mode 100644 index 0000000..ae9f221 --- /dev/null +++ b/lightning_examples/mnist-hello-world/.meta.yml @@ -0,0 +1,19 @@ +title: Introduction to PyTorch Lightning +author: PL team +created: 2020-12-21 +updated: 2023-05-15 +license: CC BY-SA +build: 0 +tags: + - Image +description: In this notebook, we'll go over the basics of lightning by preparing + models to train on the [MNIST Handwritten Digits dataset](https://en.wikipedia.org/wiki/MNIST_database). +requirements: + - torchvision + - torchmetrics >=0.11.0 + - pandas + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/lightning_examples/mnist-hello-world/hello-world.py b/lightning_examples/mnist-hello-world/hello-world.py new file mode 100644 index 0000000..fe77bd6 --- /dev/null +++ b/lightning_examples/mnist-hello-world/hello-world.py @@ -0,0 +1,481 @@ +# %% + +# ------------------- Preliminaries ------------------- # +import os +from dataclasses import dataclass +from typing import Tuple + +import lightning as L +import pandas as pd +import seaborn as sn +import torch +from IPython.display import display +from lightning.pytorch.loggers import CSVLogger +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchmetrics import Accuracy +from torchvision import transforms +from torchvision.datasets import MNIST + +# ------------------- Configuration ------------------- # + + +@dataclass +class Config: + """Configuration options for the Lightning MNIST example. + + Args: + data_dir : The path to the directory where the MNIST dataset is stored. 
Defaults to the value of + the 'PATH_DATASETS' environment variable or '.' if not set. + + save_dir : The path to the directory where the training logs will be saved. Defaults to 'logs/'. + + batch_size : The batch size to use during training. Defaults to 256 if a GPU is available, + or 64 otherwise. + + max_epochs : The maximum number of epochs to train the model for. Defaults to 3. + + accelerator : The accelerator to use for training. Can be one of "cpu", "gpu", "tpu", "ipu", "auto". + + devices : The number of devices to use for training. Defaults to 1. + + Examples: + This dataclass can be used to specify the configuration options for training a PyTorch Lightning model on the + MNIST dataset. A new instance of this dataclass can be created as follows: + + >>> config = Config() + + The default values for each argument are shown in the documentation above. If desired, any of these values can be + overridden when creating a new instance of the dataclass: + + >>> config = Config(batch_size=128, max_epochs=5) + """ + + data_dir: str = os.environ.get("PATH_DATASETS", ".") + save_dir: str = "logs/" + batch_size: int = 256 if torch.cuda.is_available() else 64 + max_epochs: int = 3 + accelerator: str = "auto" + devices: int = 1 + + +config = Config() + +# %% [markdown] +# ## Simplest example +# +# Here's the simplest most minimal example with just a training loop (no validation, no testing). +# +# **Keep in Mind** - A `LightningModule` *is* a PyTorch `nn.Module` - it just has a few more helpful features. + + +# %% + + +class MNISTModel(L.LightningModule): + """A PyTorch Lightning module for classifying images in the MNIST dataset. + + Attributes: + l1 : A linear layer that maps input features to output features. + + Methods: + forward(x): + Performs a forward pass through the model. + + training_step(batch, batch_nb): + Defines a single training step for the model. + + configure_optimizers(): + Configures the optimizer to use during training. + + Examples: + The MNISTModel class can be used to create and train a PyTorch Lightning model for classifying images in the MNIST + dataset. To create a new instance of the model, simply instantiate the class: + + >>> model = MNISTModel() + + The model can then be trained using a PyTorch Lightning trainer object: + + >>> trainer = pl.Trainer() + >>> trainer.fit(model) + """ + + def __init__(self): + """Initializes a new instance of the MNISTModel class.""" + super().__init__() + self.l1 = torch.nn.Linear(28 * 28, 10) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Performs a forward pass through the model. + + Args: + x : The input tensor to pass through the model. + + Returns: + activated : The output tensor produced by the model. + + Examples: + >>> model = MNISTModel() + >>> x = torch.randn(1, 1, 28, 28) + >>> output = model(x) + """ + flattened = x.view(x.size(0), -1) + hidden = self.l1(flattened) + activated = torch.relu(hidden) + + return activated + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_nb: int) -> torch.Tensor: + """Defines a single training step for the model. + + Args: + batch: A tuple containing the input and target tensors for the batch. + batch_nb: The batch number. + + Returns: + torch.Tensor: The loss value for the current batch. 
+ + Examples: + >>> model = MNISTModel() + >>> x = torch.randn(1, 1, 28, 28) + >>> y = torch.tensor([1]) + >>> loss = model.training_step((x, y), 0) + """ + x, y = batch + loss = F.cross_entropy(self(x), y) + return loss + + def configure_optimizers(self) -> torch.optim.Optimizer: + """Configures the optimizer to use during training. + + Returns: + torch.optim.Optimizer: The optimizer to use during training. + + Examples: + >>> model = MNISTModel() + >>> optimizer = model.configure_optimizers() + """ + return torch.optim.Adam(self.parameters(), lr=0.02) + + +# %% [markdown] +# By using the `Trainer` you automatically get: +# 1. Tensorboard logging +# 2. Model checkpointing +# 3. Training and validation loop +# 4. early-stopping + +# %% +# Init our model +mnist_model = MNISTModel() + +# Init DataLoader from MNIST Dataset +train_ds = MNIST(config.data_dir, train=True, download=True, transform=transforms.ToTensor()) + +# Create a dataloader +train_loader = DataLoader(train_ds, batch_size=config.batch_size) + +# Initialize a trainer +trainer = L.Trainer( + accelerator=config.accelerator, + devices=config.devices, + max_epochs=config.max_epochs, +) + +# Train the model ⚡ +trainer.fit(mnist_model, train_loader) + +# %% [markdown] +# ## A more complete MNIST Lightning Module Example +# +# That wasn't so hard was it? +# +# Now that we've got our feet wet, let's dive in a bit deeper and write a more complete `LightningModule` for MNIST... +# +# This time, we'll bake in all the dataset specific pieces directly in the `LightningModule`. +# This way, we can avoid writing extra code at the beginning of our script every time we want to run it. +# +# --- +# +# ### Note what the following built-in functions are doing: +# +# 1. [prepare_data()](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#prepare-data) 💾 +# - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there. +# - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`) +# +# 2. [setup(stage)](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#setup) ⚙️ +# - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). +# - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'. +# - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals). +# - **Note this runs across all GPUs and it *is* safe to make state assignments here** +# +# 3. [x_dataloader()](https://lightning.ai/docs/pytorch/stable/api/pytorch_lightning.core.hooks.DataHooks.html#pytorch_lightning.core.hooks.DataHooks.train_dataloader) ♻️ +# - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()` + + +# %% + + +class LitMNIST(L.LightningModule): + """PyTorch Lightning module for training a multi-layer perceptron (MLP) on the MNIST dataset. + + Attributes: + data_dir : The path to the directory where the MNIST data will be downloaded. + + hidden_size : The number of units in the hidden layer of the MLP. + + learning_rate : The learning rate to use for training the MLP. 
+ + Methods: + forward(x): + Performs a forward pass through the MLP. + + training_step(batch, batch_idx): + Defines a single training step for the MLP. + + validation_step(batch, batch_idx): + Defines a single validation step for the MLP. + + test_step(batch, batch_idx): + Defines a single testing step for the MLP. + + configure_optimizers(): + Configures the optimizer to use for training the MLP. + + prepare_data(): + Downloads the MNIST dataset. + + setup(stage=None): + Splits the MNIST dataset into train, validation, and test sets. + + train_dataloader(): + Returns a DataLoader for the training set. + + val_dataloader(): + Returns a DataLoader for the validation set. + + test_dataloader(): + Returns a DataLoader for the test set. + """ + + def __init__(self, data_dir: str = config.data_dir, hidden_size: int = 64, learning_rate: float = 2e-4): + """Initializes a new instance of the LitMNIST class. + + Args: + data_dir : The path to the directory where the MNIST data will be downloaded. Defaults to config.data_dir. + + hidden_size : The number of units in the hidden layer of the MLP (default is 64). + + learning_rate : The learning rate to use for training the MLP (default is 2e-4). + """ + super().__init__() + + # Set our init args as class attributes + self.data_dir = data_dir + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + # Hardcode some dataset specific attributes + self.num_classes = 10 + self.dims = (1, 28, 28) + channels, width, height = self.dims + + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + # Define PyTorch model + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, self.num_classes), + ) + + self.val_accuracy = Accuracy(task="multiclass", num_classes=10) + self.test_accuracy = Accuracy(task="multiclass", num_classes=10) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Performs a forward pass through the MLP. + + Args: + x : The input data. + + Returns: + torch.Tensor: The output of the MLP. + """ + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_nb: int) -> torch.Tensor: + """Defines a single training step for the MLP. + + Args: + batch: A tuple containing the input data and target labels. + + batch_idx: The index of the current batch. + + Returns: + (torch.Tensor): The training loss. + """ + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_nb: int) -> None: + """Defines a single validation step for the MLP. + + Args: + batch : A tuple containing the input data and target labels. + batch_idx : The index of the current batch. + """ + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.val_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", self.val_accuracy, prog_bar=True) + + def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_nb: int) -> None: + """Defines a single testing step for the MLP. + + Args: + batch : A tuple containing the input data and target labels. + batch_idx : The index of the current batch. 
+ """ + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.test_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("test_loss", loss, prog_bar=True) + self.log("test_acc", self.test_accuracy, prog_bar=True) + + def configure_optimizers(self) -> torch.optim.Optimizer: + """Configures the optimizer to use for training the MLP. + + Returns: + torch.optim.Optimizer: The optimizer. + """ + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + + return optimizer + + # ------------------------------------- # + # DATA RELATED HOOKS + # ------------------------------------- # + + def prepare_data(self) -> None: + """Downloads the MNIST dataset.""" + MNIST(self.data_dir, train=True, download=True) + + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage: str = None) -> None: + """Splits the MNIST dataset into train, validation, and test sets. + + Args: + stage : The current stage (either "fit" or "test"). Defaults to None. + """ + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform) + + def train_dataloader(self) -> DataLoader: + """Returns a DataLoader for the training set. + + Returns: + DataLoader: The training DataLoader. + """ + return DataLoader(self.mnist_train, batch_size=config.batch_size) + + def val_dataloader(self) -> DataLoader: + """Returns a DataLoader for the validation set. + + Returns: + DataLoader: The validation DataLoader. + """ + return DataLoader(self.mnist_val, batch_size=config.batch_size) + + def test_dataloader(self) -> DataLoader: + """Returns a DataLoader for the test set. + + Returns: + DataLoader: The test DataLoader. + """ + return DataLoader(self.mnist_test, batch_size=config.batch_size) + + +# %% +# Instantiate the LitMNIST model +model = LitMNIST() + +# Instantiate a PyTorch Lightning trainer with the specified configuration +trainer = L.Trainer( + accelerator=config.accelerator, + devices=config.devices, + max_epochs=config.max_epochs, + logger=CSVLogger(save_dir=config.save_dir), +) + +# Train the model using the trainer +trainer.fit(model) + +# %% [markdown] +# ### Testing +# +# To test a model, call `trainer.test(model)`. +# +# Or, if you've just trained a model, you can just call `trainer.test()` and Lightning will automatically +# test using the best saved checkpoint (conditioned on val_loss). + +# %% +trainer.test(ckpt_path="best") + +# %% [markdown] +# ### Bonus Tip +# +# You can keep calling `trainer.fit(model)` as many times as you'd like to continue training + +# %% +trainer.fit(model) + +# %% [markdown] +# In Colab, you can use the TensorBoard magic function to view the logs that Lightning has created for you! 
+ +# %% + +# Read in the training metrics from the CSV file generated by the logger +metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv") + +# Remove the "step" column, which is not needed for our analysis +del metrics["step"] + +# Set the epoch column as the index, for easier plotting +metrics.set_index("epoch", inplace=True) + +# Display the first few rows of the metrics table, excluding any columns with all NaN values +display(metrics.dropna(axis=1, how="all").head()) + +# Create a line plot of the training metrics using Seaborn +sn.relplot(data=metrics, kind="line") diff --git a/lightning_examples/mnist-tpu-training/.meta.yml b/lightning_examples/mnist-tpu-training/.meta.yml new file mode 100644 index 0000000..7c82362 --- /dev/null +++ b/lightning_examples/mnist-tpu-training/.meta.yml @@ -0,0 +1,16 @@ +title: TPU training with PyTorch Lightning +author: PL team +created: 2020-12-21 +updated: 2023-05-15 +license: CC BY-SA +build: 0 +tags: + - Image +description: In this notebook, we'll train a model on TPUs. Updating one Trainer flag is all you need for that. + The most up to documentation related to TPU training can be found + [here](https://lightning.ai/docs/pytorch/stable/accelerators/tpu.html). +requirements: + - torchvision + - lightning>=2.0.0rc0 +accelerator: + - TPU diff --git a/lightning_examples/mnist-tpu-training/mnist-tpu.py b/lightning_examples/mnist-tpu-training/mnist-tpu.py new file mode 100644 index 0000000..f0d7427 --- /dev/null +++ b/lightning_examples/mnist-tpu-training/mnist-tpu.py @@ -0,0 +1,174 @@ +# %% [markdown] +# ### Install Colab TPU compatible PyTorch/TPU wheels and dependencies + +# %% +# ! pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl + +import lightning as L + +# %% +import torch +import torch.nn.functional as F +from torch import nn +from torch.utils.data import DataLoader, random_split +from torchmetrics.functional import accuracy +from torchvision import transforms + +# Note - you must have torchvision installed for this example +from torchvision.datasets import MNIST + +BATCH_SIZE = 1024 + +# %% [markdown] +# ### Defining The `MNISTDataModule` +# +# Below we define `MNISTDataModule`. You can learn more about datamodules +# in [docs](https://lightning.ai/docs/pytorch/stable/data/datamodule.html). 
+ + +# %% +class MNISTDataModule(L.LightningDataModule): + def __init__(self, data_dir: str = "./"): + super().__init__() + self.data_dir = data_dir + self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + + self.dims = (1, 28, 28) + self.num_classes = 10 + + def prepare_data(self): + # download + MNIST(self.data_dir, train=True, download=True) + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform) + + def train_dataloader(self): + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) + + +# %% [markdown] +# ### Defining the `LitModel` +# +# Below, we define the model `LitMNIST`. + + +# %% +class LitModel(L.LightningModule): + def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4): + super().__init__() + + self.save_hyperparameters() + + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, num_classes), + ) + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + self.log("train_loss", loss) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + acc = accuracy(preds, y) + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", acc, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + return optimizer + + +# %% [markdown] +# ### TPU Training +# +# Lightning supports training on a single TPU core or 8 TPU cores. +# +# The Trainer parameter `devices` defines how many TPU cores to train on (1 or 8) / Single TPU core to train on [1] +# along with accelerator='tpu'. +# +# For Single TPU training, Just pass the TPU core ID [1-8] in a list. +# Setting `devices=[5]` will train on TPU core ID 5. + +# %% [markdown] +# Train on TPU core ID 5 with `devices=[5]`. + +# %% +# Init DataModule +dm = MNISTDataModule() +# Init model from datamodule's attributes +model = LitModel(*dm.size(), dm.num_classes) +# Init trainer +trainer = L.Trainer( + max_epochs=3, + accelerator="tpu", + devices=[5], +) +# Train +trainer.fit(model, dm) + +# %% [markdown] +# Train on single TPU core with `devices=1`. + +# %% +# Init DataModule +dm = MNISTDataModule() +# Init model from datamodule's attributes +model = LitModel(*dm.dims, dm.num_classes) +# Init trainer +trainer = L.Trainer( + max_epochs=3, + accelerator="tpu", + devices=1, +) +# Train +trainer.fit(model, dm) + +# %% [markdown] +# Train on 8 TPU cores with `accelerator='tpu'` and `devices=8`. 
+# You might have to restart the notebook to run it on 8 TPU cores after training on single TPU core. + +# %% +# Init DataModule +dm = MNISTDataModule() +# Init model from datamodule's attributes +model = LitModel(*dm.dims, dm.num_classes) +# Init trainer +trainer = L.Trainer( + max_epochs=3, + accelerator="tpu", + devices=8, +) +# Train +trainer.fit(model, dm) diff --git a/lightning_examples/reinforce-learning-DQN/.meta.yml b/lightning_examples/reinforce-learning-DQN/.meta.yml new file mode 100644 index 0000000..ac693b1 --- /dev/null +++ b/lightning_examples/reinforce-learning-DQN/.meta.yml @@ -0,0 +1,23 @@ +title: How to train a Deep Q Network +author: PL team +created: 2021-01-31 +updated: 2021-12-03 +license: CC BY-SA +build: 2 +tags: + - RL +description: | + Main takeaways: + + 1. RL has the same flow as previous models we have seen, with a few additions + 2. Handle unsupervised learning by using an IterableDataset where the dataset itself is constantly updated during training + 3. Each training step carries has the agent taking an action in the environment and storing the experience in the IterableDataset +requirements: + - gym <0.24 + - pygame + - pandas + - seaborn + - lightning>=2.0.0rc0 +accelerator: + - CPU + - GPU diff --git a/lightning_examples/reinforce-learning-DQN/dqn.py b/lightning_examples/reinforce-learning-DQN/dqn.py new file mode 100644 index 0000000..16a431a --- /dev/null +++ b/lightning_examples/reinforce-learning-DQN/dqn.py @@ -0,0 +1,382 @@ +# %% +import os +from collections import OrderedDict, deque, namedtuple +from typing import Iterator, List, Tuple + +import gym +import numpy as np +import pandas as pd +import seaborn as sn +import torch +from IPython.core.display import display +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.loggers import CSVLogger +from torch import Tensor, nn +from torch.optim import Adam, Optimizer +from torch.utils.data import DataLoader +from torch.utils.data.dataset import IterableDataset + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") + + +# %% +class DQN(nn.Module): + def __init__(self, obs_size: int, n_actions: int, hidden_size: int = 128): + """Simple MLP network. + + Args: + obs_size: observation/state size of the environment + n_actions: number of discrete actions available in the environment + hidden_size: size of hidden layers + """ + super().__init__() + self.net = nn.Sequential( + nn.Linear(obs_size, hidden_size), + nn.ReLU(), + nn.Linear(hidden_size, n_actions), + ) + + def forward(self, x): + return self.net(x.float()) + + +# %% [markdown] +# ### Memory + +# %% + +# Named tuple for storing experience steps gathered in training +Experience = namedtuple( + "Experience", + field_names=["state", "action", "reward", "done", "new_state"], +) + + +# %% +class ReplayBuffer: + """Replay Buffer for storing past experiences allowing the agent to learn from them. + + Args: + capacity: size of the buffer + """ + + def __init__(self, capacity: int) -> None: + self.buffer = deque(maxlen=capacity) + + def __len__(self) -> None: + return len(self.buffer) + + def append(self, experience: Experience) -> None: + """Add experience to the buffer. 
+ + Args: + experience: tuple (state, action, reward, done, new_state) + """ + self.buffer.append(experience) + + def sample(self, batch_size: int) -> Tuple: + indices = np.random.choice(len(self.buffer), batch_size, replace=False) + states, actions, rewards, dones, next_states = zip(*(self.buffer[idx] for idx in indices)) + + return ( + np.array(states), + np.array(actions), + np.array(rewards, dtype=np.float32), + np.array(dones, dtype=bool), + np.array(next_states), + ) + + +# %% +class RLDataset(IterableDataset): + """Iterable Dataset containing the ExperienceBuffer which will be updated with new experiences during training. + + Args: + buffer: replay buffer + sample_size: number of experiences to sample at a time + """ + + def __init__(self, buffer: ReplayBuffer, sample_size: int = 200) -> None: + self.buffer = buffer + self.sample_size = sample_size + + def __iter__(self) -> Iterator[Tuple]: + states, actions, rewards, dones, new_states = self.buffer.sample(self.sample_size) + for i in range(len(dones)): + yield states[i], actions[i], rewards[i], dones[i], new_states[i] + + +# %% [markdown] +# ### Agent + + +# %% +class Agent: + def __init__(self, env: gym.Env, replay_buffer: ReplayBuffer) -> None: + """Base Agent class handeling the interaction with the environment. + + Args: + env: training environment + replay_buffer: replay buffer storing experiences + """ + self.env = env + self.replay_buffer = replay_buffer + self.reset() + self.state = self.env.reset() + + def reset(self) -> None: + """Resents the environment and updates the state.""" + self.state = self.env.reset() + + def get_action(self, net: nn.Module, epsilon: float, device: str) -> int: + """Using the given network, decide what action to carry out using an epsilon-greedy policy. + + Args: + net: DQN network + epsilon: value to determine likelihood of taking a random action + device: current device + + Returns: + action + """ + if np.random.random() < epsilon: + action = self.env.action_space.sample() + else: + state = torch.tensor([self.state]) + + if device not in ["cpu"]: + state = state.cuda(device) + + q_values = net(state) + _, action = torch.max(q_values, dim=1) + action = int(action.item()) + + return action + + @torch.no_grad() + def play_step( + self, + net: nn.Module, + epsilon: float = 0.0, + device: str = "cpu", + ) -> Tuple[float, bool]: + """Carries out a single interaction step between the agent and the environment. + + Args: + net: DQN network + epsilon: value to determine likelihood of taking a random action + device: current device + + Returns: + reward, done + """ + action = self.get_action(net, epsilon, device) + + # do step in the environment + # So, in the deprecated version of gym, the env.step() has 4 values unpacked which is + # obs, reward, done, info = env.step(action) + # In the latest version of gym, the step() function returns back an additional variable which is truncated. 
+ # obs, reward, terminated, truncated, info = env.step(action) + new_state, reward, done, _ = self.env.step(action) + + exp = Experience(self.state, action, reward, done, new_state) + + self.replay_buffer.append(exp) + + self.state = new_state + if done: + self.reset() + return reward, done + + +# %% [markdown] +# ### DQN Lightning Module + + +# %% +class DQNLightning(LightningModule): + def __init__( + self, + batch_size: int = 16, + lr: float = 1e-2, + env: str = "CartPole-v0", + gamma: float = 0.99, + sync_rate: int = 10, + replay_size: int = 1000, + warm_start_size: int = 1000, + eps_last_frame: int = 1000, + eps_start: float = 1.0, + eps_end: float = 0.01, + episode_length: int = 200, + warm_start_steps: int = 1000, + ) -> None: + """Basic DQN Model. + + Args: + batch_size: size of the batches") + lr: learning rate + env: gym environment tag + gamma: discount factor + sync_rate: how many frames do we update the target network + replay_size: capacity of the replay buffer + warm_start_size: how many samples do we use to fill our buffer at the start of training + eps_last_frame: what frame should epsilon stop decaying + eps_start: starting value of epsilon + eps_end: final value of epsilon + episode_length: max length of an episode + warm_start_steps: max episode reward in the environment + """ + super().__init__() + self.save_hyperparameters() + + self.env = gym.make(self.hparams.env) + obs_size = self.env.observation_space.shape[0] + n_actions = self.env.action_space.n + + self.net = DQN(obs_size, n_actions) + self.target_net = DQN(obs_size, n_actions) + + self.buffer = ReplayBuffer(self.hparams.replay_size) + self.agent = Agent(self.env, self.buffer) + self.total_reward = 0 + self.episode_reward = 0 + self.populate(self.hparams.warm_start_steps) + + def populate(self, steps: int = 1000) -> None: + """Carries out several random steps through the environment to initially fill up the replay buffer with + experiences. + + Args: + steps: number of random steps to populate the buffer with + """ + for _ in range(steps): + self.agent.play_step(self.net, epsilon=1.0) + + def forward(self, x: Tensor) -> Tensor: + """Passes in a state x through the network and gets the q_values of each action as an output. + + Args: + x: environment state + + Returns: + q values + """ + output = self.net(x) + return output + + def dqn_mse_loss(self, batch: Tuple[Tensor, Tensor]) -> Tensor: + """Calculates the mse loss using a mini batch from the replay buffer. + + Args: + batch: current mini batch of replay data + + Returns: + loss + """ + states, actions, rewards, dones, next_states = batch + + state_action_values = self.net(states).gather(1, actions.long().unsqueeze(-1)).squeeze(-1) + + with torch.no_grad(): + next_state_values = self.target_net(next_states).max(1)[0] + next_state_values[dones] = 0.0 + next_state_values = next_state_values.detach() + + expected_state_action_values = next_state_values * self.hparams.gamma + rewards + + return nn.MSELoss()(state_action_values, expected_state_action_values) + + def get_epsilon(self, start: int, end: int, frames: int) -> float: + if self.global_step > frames: + return end + return start - (self.global_step / frames) * (start - end) + + def training_step(self, batch: Tuple[Tensor, Tensor], nb_batch) -> OrderedDict: + """Carries out a single step through the environment to update the replay buffer. Then calculates loss + based on the minibatch recieved. 
+ + Args: + batch: current mini batch of replay data + nb_batch: batch number + + Returns: + Training loss and log metrics + """ + device = self.get_device(batch) + epsilon = self.get_epsilon(self.hparams.eps_start, self.hparams.eps_end, self.hparams.eps_last_frame) + self.log("epsilon", epsilon) + + # step through environment with agent + reward, done = self.agent.play_step(self.net, epsilon, device) + self.episode_reward += reward + self.log("episode reward", self.episode_reward) + + # calculates training loss + loss = self.dqn_mse_loss(batch) + + if done: + self.total_reward = self.episode_reward + self.episode_reward = 0 + + # Soft update of target network + if self.global_step % self.hparams.sync_rate == 0: + self.target_net.load_state_dict(self.net.state_dict()) + + self.log_dict( + { + "reward": reward, + "train_loss": loss, + } + ) + self.log("total_reward", self.total_reward, prog_bar=True) + self.log("steps", self.global_step, logger=False, prog_bar=True) + + return loss + + def configure_optimizers(self) -> List[Optimizer]: + """Initialize Adam optimizer.""" + optimizer = Adam(self.net.parameters(), lr=self.hparams.lr) + return optimizer + + def __dataloader(self) -> DataLoader: + """Initialize the Replay Buffer dataset used for retrieving experiences.""" + dataset = RLDataset(self.buffer, self.hparams.episode_length) + dataloader = DataLoader( + dataset=dataset, + batch_size=self.hparams.batch_size, + ) + return dataloader + + def train_dataloader(self) -> DataLoader: + """Get train loader.""" + return self.__dataloader() + + def get_device(self, batch) -> str: + """Retrieve device currently being used by minibatch.""" + return batch[0].device.index if self.on_gpu else "cpu" + + +# %% [markdown] +# ### Trainer + +# %% + +model = DQNLightning() + +trainer = Trainer( + accelerator="auto", + devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + max_epochs=150, + val_check_interval=50, + logger=CSVLogger(save_dir="logs/"), +) + +trainer.fit(model) + +# %% + +metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv") +del metrics["step"] +metrics.set_index("epoch", inplace=True) +display(metrics.dropna(axis=1, how="all").head()) +sn.relplot(data=metrics, kind="line") diff --git a/lightning_examples/text-transformers/.meta.yml b/lightning_examples/text-transformers/.meta.yml new file mode 100644 index 0000000..22b96a8 --- /dev/null +++ b/lightning_examples/text-transformers/.meta.yml @@ -0,0 +1,21 @@ +title: Finetune Transformers Models with PyTorch Lightning +author: PL team +created: 2021-01-31 +updated: 2023-03-17 +license: CC BY-SA +build: 0 +tags: + - Text +description: | + This notebook will use HuggingFace's `datasets` library to get data, which will be wrapped in a `LightningDataModule`. + Then, we write a class to perform text classification on any dataset from the [GLUE Benchmark](https://gluebenchmark.com/). 
+ (We just show CoLA and MRPC due to constraint on compute/disk) +requirements: + - transformers + - datasets + - scipy + - scikit-learn + - torchtext>=0.9 + - lightning>=2.0.0rc0 +accelerator: + - GPU diff --git a/lightning_examples/text-transformers/text-transformers.py b/lightning_examples/text-transformers/text-transformers.py new file mode 100644 index 0000000..a570f82 --- /dev/null +++ b/lightning_examples/text-transformers/text-transformers.py @@ -0,0 +1,329 @@ +# %% +from collections import defaultdict +from datetime import datetime +from typing import Optional + +import datasets +import lightning as L +import torch +from torch.utils.data import DataLoader +from transformers import ( + AdamW, + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + get_linear_schedule_with_warmup, +) + +# %% [markdown] +# ## Training BERT with Lightning + +# %% [markdown] +# ### Lightning DataModule for GLUE + + +# %% +class GLUEDataModule(L.LightningDataModule): + task_text_field_map = { + "cola": ["sentence"], + "sst2": ["sentence"], + "mrpc": ["sentence1", "sentence2"], + "qqp": ["question1", "question2"], + "stsb": ["sentence1", "sentence2"], + "mnli": ["premise", "hypothesis"], + "qnli": ["question", "sentence"], + "rte": ["sentence1", "sentence2"], + "wnli": ["sentence1", "sentence2"], + "ax": ["premise", "hypothesis"], + } + + glue_task_num_labels = { + "cola": 2, + "sst2": 2, + "mrpc": 2, + "qqp": 2, + "stsb": 1, + "mnli": 3, + "qnli": 2, + "rte": 2, + "wnli": 2, + "ax": 3, + } + + loader_columns = [ + "datasets_idx", + "input_ids", + "token_type_ids", + "attention_mask", + "start_positions", + "end_positions", + "labels", + ] + + def __init__( + self, + model_name_or_path: str, + task_name: str = "mrpc", + max_seq_length: int = 128, + train_batch_size: int = 32, + eval_batch_size: int = 32, + **kwargs, + ): + super().__init__() + self.model_name_or_path = model_name_or_path + self.task_name = task_name + self.max_seq_length = max_seq_length + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + + self.text_fields = self.task_text_field_map[task_name] + self.num_labels = self.glue_task_num_labels[task_name] + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + + def setup(self, stage=None): + self.dataset = datasets.load_dataset("glue", self.task_name) + + for split in self.dataset.keys(): + self.dataset[split] = self.dataset[split].map( + self.convert_to_features, + batched=True, + remove_columns=["label"], + ) + self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns] + self.dataset[split].set_format(type="torch", columns=self.columns) + + self.eval_splits = [x for x in self.dataset.keys() if "validation" in x] + + def prepare_data(self): + datasets.load_dataset("glue", self.task_name) + AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + + def train_dataloader(self): + return DataLoader(self.dataset["train"], batch_size=self.train_batch_size, shuffle=True) + + def val_dataloader(self): + if len(self.eval_splits) == 1: + return DataLoader(self.dataset["validation"], batch_size=self.eval_batch_size) + elif len(self.eval_splits) > 1: + return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits] + + def test_dataloader(self): + if len(self.eval_splits) == 1: + return DataLoader(self.dataset["test"], batch_size=self.eval_batch_size) + elif len(self.eval_splits) > 1: + return [DataLoader(self.dataset[x], 
batch_size=self.eval_batch_size) for x in self.eval_splits] + + def convert_to_features(self, example_batch, indices=None): + # Either encode single sentence or sentence pairs + if len(self.text_fields) > 1: + texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]])) + else: + texts_or_text_pairs = example_batch[self.text_fields[0]] + + # Tokenize the text/text pairs + features = self.tokenizer.batch_encode_plus( + texts_or_text_pairs, max_length=self.max_seq_length, pad_to_max_length=True, truncation=True + ) + + # Rename label to labels to make it easier to pass to model forward + features["labels"] = example_batch["label"] + + return features + + +# %% [markdown] +# **You could use this datamodule with standalone PyTorch if you wanted...** + +# %% +dm = GLUEDataModule("distilbert-base-uncased") +dm.prepare_data() +dm.setup("fit") +next(iter(dm.train_dataloader())) + +# %% [markdown] +# ### Transformer LightningModule + + +# %% +class GLUETransformer(L.LightningModule): + def __init__( + self, + model_name_or_path: str, + num_labels: int, + task_name: str, + learning_rate: float = 2e-5, + adam_epsilon: float = 1e-8, + warmup_steps: int = 0, + weight_decay: float = 0.0, + train_batch_size: int = 32, + eval_batch_size: int = 32, + eval_splits: Optional[list] = None, + **kwargs, + ): + super().__init__() + + self.save_hyperparameters() + + self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels) + self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config) + self.metric = datasets.load_metric( + "glue", self.hparams.task_name, experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S") + ) + self.outputs = defaultdict(list) + + def forward(self, **inputs): + return self.model(**inputs) + + def training_step(self, batch, batch_idx): + outputs = self(**batch) + loss = outputs[0] + return loss + + def validation_step(self, batch, batch_idx, dataloader_idx=0): + outputs = self(**batch) + val_loss, logits = outputs[:2] + + if self.hparams.num_labels > 1: + preds = torch.argmax(logits, axis=1) + elif self.hparams.num_labels == 1: + preds = logits.squeeze() + + labels = batch["labels"] + + self.outputs[dataloader_idx].append({"loss": val_loss, "preds": preds, "labels": labels}) + + def on_validation_epoch_end(self): + if self.hparams.task_name == "mnli": + for i, outputs in self.outputs.items(): + # matched or mismatched + split = self.hparams.eval_splits[i].split("_")[-1] + preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy() + labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy() + loss = torch.stack([x["loss"] for x in outputs]).mean() + self.log(f"val_loss_{split}", loss, prog_bar=True) + split_metrics = { + f"{k}_{split}": v for k, v in self.metric.compute(predictions=preds, references=labels).items() + } + self.log_dict(split_metrics, prog_bar=True) + return loss + + flat_outputs = [] + for lst in self.outputs.values(): + flat_outputs.extend(lst) + + preds = torch.cat([x["preds"] for x in flat_outputs]).detach().cpu().numpy() + labels = torch.cat([x["labels"] for x in flat_outputs]).detach().cpu().numpy() + loss = torch.stack([x["loss"] for x in flat_outputs]).mean() + self.log("val_loss", loss, prog_bar=True) + self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True) + self.outputs.clear() + + def configure_optimizers(self): + """Prepare optimizer and schedule (linear warmup and decay).""" + model = 
self.model + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": self.hparams.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon) + + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=self.hparams.warmup_steps, + num_training_steps=self.trainer.estimated_stepping_batches, + ) + scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} + return [optimizer], [scheduler] + + +# %% [markdown] +# ## Training + +# %% [markdown] +# ### CoLA +# +# See an interactive view of the +# CoLA dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola) + +# %% +L.seed_everything(42) + +dm = GLUEDataModule(model_name_or_path="albert-base-v2", task_name="cola") +dm.setup("fit") +model = GLUETransformer( + model_name_or_path="albert-base-v2", + num_labels=dm.num_labels, + eval_splits=dm.eval_splits, + task_name=dm.task_name, +) + +trainer = L.Trainer( + max_epochs=1, + accelerator="auto", + devices=1, +) +trainer.fit(model, datamodule=dm) + +# %% [markdown] +# ### MRPC +# +# See an interactive view of the +# MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mrpc) + +# %% +L.seed_everything(42) + +dm = GLUEDataModule( + model_name_or_path="distilbert-base-cased", + task_name="mrpc", +) +dm.setup("fit") +model = GLUETransformer( + model_name_or_path="distilbert-base-cased", + num_labels=dm.num_labels, + eval_splits=dm.eval_splits, + task_name=dm.task_name, +) + +trainer = L.Trainer( + max_epochs=3, + accelerator="auto", + devices=1, +) +trainer.fit(model, datamodule=dm) + +# %% [markdown] +# ### MNLI +# +# - The MNLI dataset is huge, so we aren't going to bother trying to train on it here. +# - We will skip over training and go straight to validation. +# +# See an interactive view of the +# MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mnli) + +# %% +dm = GLUEDataModule( + model_name_or_path="distilbert-base-cased", + task_name="mnli", +) +dm.setup("fit") +model = GLUETransformer( + model_name_or_path="distilbert-base-cased", + num_labels=dm.num_labels, + eval_splits=dm.eval_splits, + task_name=dm.task_name, +) + +trainer = L.Trainer( + max_epochs=3, + accelerator="auto", + devices=1, +) +trainer.validate(model, dm) diff --git a/lightning_examples/warp-drive/.meta.yml b/lightning_examples/warp-drive/.meta.yml new file mode 100644 index 0000000..adf93f0 --- /dev/null +++ b/lightning_examples/warp-drive/.meta.yml @@ -0,0 +1,30 @@ +title: Multi-agent Reinforcement Learning With WarpDrive +author: Sunil Srinivasa (sunil.srinivasa@salesforce.com), Tian Lan (tian.lan@salesforce.com), Huan Wang (huan.wang@salesforce.com) and Stephan Zheng(stephan.zheng@salesforce.com) +created: 2022-03-01 +license: BSD 3-Clause "New" or "Revised" License +tags: + - Reinforcement Learning + - Multi-agent + - GPU +description: This notebook introduces multi-agent reinforcement learning (MARL) with WarpDrive (Lan et al. https://arxiv.org/abs/2108.13976). + WarpDrive is a flexible, lightweight, and easy-to-use open-source framework that implements end-to-end deep MARL on GPUs. 
+  WarpDrive enables orders-of-magnitude speedups compared to CPU-GPU implementations, using the parallelization capability
+  of GPUs and several design choices to minimize communication overhead. WarpDrive also prioritizes user-friendliness -
+  it has utility functions to easily build MARL environments in CUDA and quality-of-life tools to run end-to-end MARL
+  using just a few lines of code, and is compatible with PyTorch.
+
+  WarpDrive includes the following resources:
+  code - https://github.com/salesforce/warp-drive
+  documentation - http://opensource.salesforce.com/warp-drive/, and
+  white paper - https://arxiv.org/abs/2108.13976.
+
+requirements:
+  - rl-warp-drive==2.1
+  - ffmpeg-python
+  # todo: after merging #155 we will relax this just to `torch<=1.10` and drop TV, TT, etc.
+  - torch==1.10.*
+  - torchvision==0.11.*
+  - torchtext==0.11.*
+  - lightning>=2.0.0rc0
+accelerator:
+  - GPU
diff --git a/lightning_examples/warp-drive/multi_agent_rl.py b/lightning_examples/warp-drive/multi_agent_rl.py
new file mode 100644
index 0000000..33efbe2
--- /dev/null
+++ b/lightning_examples/warp-drive/multi_agent_rl.py
@@ -0,0 +1,261 @@
+# %% [markdown]
+# **⚠️ PLEASE NOTE:**
+# This notebook runs on a GPU runtime. If running on Colab, choose Runtime > Change runtime type from the menu, then select `GPU` in the 'Hardware accelerator' dropdown menu.
+
+# %% [markdown]
+# ## Introduction
+
+# %% [markdown]
+# This tutorial provides a demonstration of a multi-agent Reinforcement Learning (RL) training loop with [WarpDrive](https://github.com/salesforce/warp-drive). WarpDrive is a flexible, lightweight, and easy-to-use RL framework that implements end-to-end deep multi-agent RL on a GPU (Graphics Processing Unit). Using the extreme parallelization capability of GPUs, it enables [orders-of-magnitude faster RL](https://arxiv.org/abs/2108.13976) compared to common implementations that blend CPU simulations and GPU models. WarpDrive is extremely efficient as it runs simulations across multiple agents and multiple environment replicas all in parallel and completely eliminates the back-and-forth data copying between the CPU and the GPU during every step. As such, WarpDrive
+# - Can simulate 1000s of agents in each environment and thousands of environments in parallel, harnessing the extreme parallelism capability of GPUs.
+# - Eliminates communication between CPU and GPU, and also within the GPU, as read and write operations occur in-place.
+# - Is fully compatible with PyTorch, a highly flexible and very fast deep learning framework.
+# - Implements parallel action sampling on CUDA C, which is ~3x faster than using PyTorch’s sampling methods.
+# - Allows for large-scale distributed training on multiple GPUs.
+#
+# Below is an overview of WarpDrive’s layout of computational and data structures on a single GPU.
+# ![](https://blog.salesforceairesearch.com/content/images/2021/08/warpdrive_framework_overview.png)
+# Computations are organized into blocks, with multiple threads in each block. Each block runs a simulation environment and each thread
+# simulates an agent in an environment. Blocks can access the shared GPU memory that stores simulation data and neural network policy models. A DataManager and FunctionManager enable defining multi-agent RL GPU-workflows with Python APIs. For more details, please read our white [paper](https://arxiv.org/abs/2108.13976).
+# +# The Warpdrive framework comprises several utility functions that help easily implement any (OpenAI-)*gym-style* RL environment, and furthermore, provides quality-of-life tools to train it end-to-end using just a few lines of code. You may familiarize yourself with WarpDrive with the help of these [tutorials](https://github.com/salesforce/warp-drive/tree/master/tutorials). +# +# We invite everyone to **contribute to WarpDrive**, including adding new multi-agent environments, proposing new features and reporting issues on our open source [repository](https://github.com/salesforce/warp-drive). +# +# We have integrated WarpDrive with the [PyTorch Lightning](https://www.lightning.ai/) framework, which greatly reduces the trainer boilerplate code, and improves training modularity and flexibility. It abstracts away most of the engineering pieces of code, so users can focus on research and building models, and iterate on experiments really fast. PyTorch Lightning also provides support for easily running the model on any hardware, performing distributed training, model checkpointing, performance profiling, logging and visualization. +# +# Below, we demonstrate how to use WarpDrive and PyTorch Lightning together to train a game of [Tag](https://github.com/salesforce/warp-drive/blob/master/example_envs/tag_continuous/tag_continuous.py) where multiple *tagger* agents are trying to run after and tag multiple other *runner* agents. Here's a sample depiction of the game of Tag with $100$ runners and $5$ taggers. +# ![](https://blog.salesforceairesearch.com/content/images/2021/08/same_speed_50fps-1.gif) + +# %% [markdown] +# ## Dependencies + +# %% +import logging + +import torch +from example_envs.tag_continuous.tag_continuous import TagContinuous +from pytorch_lightning import Trainer +from warp_drive.env_wrapper import EnvWrapper +from warp_drive.training.pytorch_lightning import CUDACallback, PerfStatsCallback, WarpDriveModule + +# Uncomment below for enabling animation visualizations. +# from example_envs.utils.generate_rollout_animation import generate_tag_env_rollout_animation +# from IPython.display import HTML + + +# %% +assert torch.cuda.device_count() > 0, "This notebook only runs on a GPU!" + +# %% +# Set logger level e.g., DEBUG, INFO, WARNING, ERROR. +logging.getLogger().setLevel(logging.ERROR) + +# %% [markdown] +# ## Specify a set of run configurations for your experiments +# +# The run configuration is a dictionary comprising the environment parameters, the trainer and the policy network settings, as well as configurations for saving. +# +# For our experiment, we consider an environment wherein $5$ taggers and $100$ runners play the game of [Tag](https://github.com/salesforce/warp-drive/blob/master/example_envs/tag_continuous/tag_continuous.py) on a $20 \times 20$ plane. The game lasts $200$ timesteps. Each agent chooses it's own acceleration and turn actions at every timestep, and we use mechanics to determine how the agents move over the grid. When a tagger gets close to a runner, the runner is tagged, and is eliminated from the game. For the configuration below, the runners and taggers have the same unit skill levels, or top speeds. +# +# We train the agents using $50$ environments or simulations running in parallel. With WarpDrive, each simulation runs on separate GPU blocks. +# +# There are two separate policy networks used for the tagger and runner agents. Each network is a fully-connected model with two layers each of $256$ dimensions. 
We use the Advantage Actor Critic (A2C) algorithm for training. WarpDrive also currently provides the option to use the Proximal Policy Optimization (PPO) algorithm instead. + +# %% +run_config = dict( + name="tag_continuous", + # Environment settings. + env=dict( + # number of taggers in the environment + num_taggers=5, + # number of runners in the environment + num_runners=100, + # length of the (square) grid on which the game is played + grid_length=20.0, + # episode length in timesteps + episode_length=200, + # maximum acceleration + max_acceleration=0.1, + # minimum acceleration + min_acceleration=-0.1, + # maximum turn (in radians) + max_turn=2.35, # 3pi/4 radians + # minimum turn (in radians) + min_turn=-2.35, # -3pi/4 radians + # number of discretized accelerate actions + num_acceleration_levels=10, + # number of discretized turn actions + num_turn_levels=10, + # skill level for the tagger + skill_level_tagger=1.0, + # skill level for the runner + skill_level_runner=1.0, + # each agent sees the full (or partial) information of the world + use_full_observation=False, + # flag to indicate if a runner stays in the game after getting tagged + runner_exits_game_after_tagged=True, + # number of other agents each agent can see + # used in the case use_full_observation is False + num_other_agents_observed=10, + # positive reward for a tagger upon tagging a runner + tag_reward_for_tagger=10.0, + # negative reward for a runner upon getting tagged + tag_penalty_for_runner=-10.0, + # reward at the end of the game for a runner that isn't tagged + end_of_game_reward_for_runner=1.0, + # distance margin between a tagger and runner + # to consider the runner as being 'tagged' + tagging_distance=0.02, + ), + # Trainer settings. + trainer=dict( + # number of environment replicas (number of GPU blocks used) + num_envs=50, + # total batch size used for training per iteration (across all the environments) + train_batch_size=10000, + # total number of episodes to run the training for + # This can be set arbitrarily high! + num_episodes=500, + ), + # Policy network settings. + policy=dict( + runner=dict( + # flag indicating whether the model needs to be trained + to_train=True, + # algorithm used to train the policy + algorithm="A2C", + # discount rate + gamma=0.98, + # learning rate + lr=0.005, + # policy model settings + model=dict(type="fully_connected", fc_dims=[256, 256], model_ckpt_filepath=""), + ), + tagger=dict( + to_train=True, + algorithm="A2C", + gamma=0.98, + lr=0.002, + model=dict(type="fully_connected", fc_dims=[256, 256], model_ckpt_filepath=""), + ), + ), + # Checkpoint saving setting. + saving=dict( + # how often (in iterations) to print the metrics + metrics_log_freq=10, + # how often (in iterations) to save the model parameters + model_params_save_freq=5000, + # base folder used for saving + basedir="/tmp", + # experiment name + name="continuous_tag", + # experiment tag + tag="example", + ), +) + +# %% [markdown] +# ## Instantiate the WarpDrive Module +# +# In order to instantiate the WarpDrive module, we first use an environment wrapper to specify that the environment needs to be run on the GPU (via the `use_cuda` flag). Also, agents in the environment can share policy models; so we specify a dictionary to map each policy network model to the list of agent ids using that model. + +# %% +# Create a wrapped environment object via the EnvWrapper +# Ensure that env_backend is set to be "pycuda" or "numba"(in order to run on the GPU) +# WarpDrive v2 supports JIT compiled Numba backend now! 
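+# (Added note: the cell below uses the "pycuda" backend; switching `env_backend` to "numba" should also work with
+# the Numba backend mentioned above, though that swap is untested here.)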
+env_wrapper = EnvWrapper( + TagContinuous(**run_config["env"]), + num_envs=run_config["trainer"]["num_envs"], + env_backend="pycuda", +) + +# Agents can share policy models: this dictionary maps policy model names to agent ids. +policy_tag_to_agent_id_map = { + "tagger": list(env_wrapper.env.taggers), + "runner": list(env_wrapper.env.runners), +} + +wd_module = WarpDriveModule( + env_wrapper=env_wrapper, + config=run_config, + policy_tag_to_agent_id_map=policy_tag_to_agent_id_map, + verbose=True, +) + + +# %% [markdown] +# ## Visualizing an episode roll-out before training +# +# We have created a helper function (see below) to visualize an episode rollout. Internally, this function uses the WarpDrive module's `fetch_episode_states` API to fetch the data arrays on the GPU for the duration of an entire episode. Specifically, we fetch the state arrays pertaining to agents' x and y locations on the plane and indicators on which agents are still active in the game. Note that this function may be invoked at any time during training, and it will use the state of the policy models at that time to sample actions and generate the visualization. + +# %% [markdown] +# The animation below shows a sample realization of the game episode before training, i.e., with randomly chosen agent actions. The $5$ taggers are marked in pink, while the $100$ blue agents are the runners. Both the taggers and runners move around randomly and about half the runners remain at the end of the episode. + +# %% +# Uncomment below for enabling animation visualizations. +# anim = generate_tag_env_rollout_animation(wd_module, fps=25) +# HTML(anim.to_html5_video()) + +# %% [markdown] +# ## Create the Lightning Trainer +# +# Next, we create the trainer for training the WarpDrive model. We add the `performance stats` callbacks to the trainer to view the throughput performance of WarpDrive. + +# %% +log_freq = run_config["saving"]["metrics_log_freq"] + +# Define callbacks. +cuda_callback = CUDACallback(module=wd_module) +perf_stats_callback = PerfStatsCallback( + batch_size=wd_module.training_batch_size, + num_iters=wd_module.num_iters, + log_freq=log_freq, +) + +# Instantiate the PyTorch Lightning trainer with the callbacks. +# Also, set the number of gpus to 1, since this notebook uses just a single GPU. +num_gpus = 1 +num_episodes = run_config["trainer"]["num_episodes"] +episode_length = run_config["env"]["episode_length"] +training_batch_size = run_config["trainer"]["train_batch_size"] +num_epochs = int(num_episodes * episode_length / training_batch_size) + +trainer = Trainer( + accelerator="gpu", + devices=num_gpus, + callbacks=[cuda_callback, perf_stats_callback], + max_epochs=num_epochs, + log_every_n_steps=1, + reload_dataloaders_every_n_epochs=1, +) + +# %% +# Start tensorboard. +# %load_ext tensorboard +# %tensorboard --logdir lightning_logs/ + +# %% [markdown] +# ## Train the WarpDrive Module +# +# Finally, we invoke training. +# +# Note: please scroll up to the tensorboard cell to visualize the curves during training. + +# %% +trainer.fit(wd_module) + +# %% [markdown] +# ## Visualize an episode-rollout after training + +# %% +# Uncomment below for enabling animation visualizations. +# anim = generate_tag_env_rollout_animation(wd_module, fps=25) +# HTML(anim.to_html5_video()) + +# %% [markdown] +# Note: In the configuration above, we have set the trainer to only train on $500$ rollout episodes, but you can increase the `num_episodes` configuration parameter to train further. 
As more training happens, the runners learn to escape the taggers, and the taggers learn to chase after the runner. Sometimes, the taggers also collaborate to team-tag runners. A good number of episodes to train on (for the configuration we have used) is $2$M or higher. + +# %% +# Finally, close the WarpDrive module to clear up the CUDA memory heap +wd_module.graceful_close() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d867478 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,99 @@ +[metadata] +license_file = "LICENSE" +description-file = "README.md" + + +[tool.check-manifest] +ignore = [ + "*.yml", + ".github", + ".github/*" +] + + +[tool.pytest.ini_options] +norecursedirs = [ + ".git", + ".github", + "dist", + "build", + "docs", +] +addopts = [ + "--strict-markers", + "--doctest-modules", + "--color=yes", + "--disable-pytest-warnings", +] +filterwarnings = [ + "error::FutureWarning", +] +xfail_strict = true +junit_duration_report = "call" + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "pass", +] + + +[tool.black] +# https://github.com/psf/black +line-length = 120 +exclude = "(.eggs|.git|.hg|.venv|_build|buck-out|build)" + +[tool.isort] +skip_glob = [] +profile = "black" +line_length = 120 + + +[tool.ruff] +line-length = 120 +# Enable Pyflakes `E` and `F` codes by default. +select = [ + "E", "W", # see: https://pypi.org/project/pycodestyle + "F", # see: https://pypi.org/project/pyflakes +# "D", # see: https://pypi.org/project/pydocstyle +# "N", # see: https://pypi.org/project/pep8-naming +] +#extend-select = [ +# "C4", # see: https://pypi.org/project/flake8-comprehensions +# "PT", # see: https://pypi.org/project/flake8-pytest-style +# "RET", # see: https://pypi.org/project/flake8-return +# "SIM", # see: https://pypi.org/project/flake8-simplify +#] +ignore = [ + "E731", # Do not assign a lambda expression, use a def + # TODO: we shall format all long comments as it comes from text cells + "E501", # Line too long +] +# Exclude a variety of commonly ignored directories. +exclude = [ + ".eggs", + ".git", + ".ruff_cache", + "__pypackages__", + "_build", + "build", + "dist", + "docs" +] +ignore-init-module-imports = true + +[tool.ruff.per-file-ignores] +"setup.py" = ["D100", "SIM115"] +"__about__.py" = ["D100"] +"__init__.py" = ["D100"] + +[tool.ruff.pydocstyle] +# Use Google-style docstrings. +convention = "google" + +[tool.ruff.pycodestyle] +ignore-overlong-task-comments = true + +[tool.ruff.mccabe] +# Unlike Flake8, default to a complexity level of 10. +max-complexity = 10 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9624fab --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +-r _requirements/devel.txt +-r .actions/requires.txt + +# default for all examples +-r _requirements/default.txt diff --git a/templates/img-classify/.meta.yml b/templates/img-classify/.meta.yml new file mode 100644 index 0000000..b26049d --- /dev/null +++ b/templates/img-classify/.meta.yml @@ -0,0 +1,21 @@ +title: Simple image classification with Lightning Flash +author: PL team +created: 2022-04-14 +updated: 2021-06-16 +license: CC BY-SA +build: 2 +tags: + - Image +description: | + This is a template to show simple image classification case if for some reason accelerator is required. 
+requirements: + - lightning-flash[image]>=0.7 + - numpy<1.24 + - pandas>=1.0 + - matplotlib>=3.0 + - seaborn +accelerator: + - GPU +datasets: + web: + - https://pl-flash-data.s3.amazonaws.com/hymenoptera_data.zip diff --git a/templates/img-classify/classify.py b/templates/img-classify/classify.py new file mode 100644 index 0000000..ba0273b --- /dev/null +++ b/templates/img-classify/classify.py @@ -0,0 +1,70 @@ +# %% +import os + +import flash +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sn +from flash.image import ImageClassificationData, ImageClassifier +from IPython.core.display import display +from pytorch_lightning.loggers import CSVLogger + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +# this dataset is automatically downloaded and extracted based on meta link +# this archive includes the one more level - folder with the same name +DATA_HYMENOPLERA = os.path.join(PATH_DATASETS, "hymenoptera_data", "hymenoptera_data") + +# %% [markdown] +# ## 1. Create the DataModule + +# %% +datamodule = ImageClassificationData.from_folders( + train_folder=f"{DATA_HYMENOPLERA}/train/", + val_folder=f"{DATA_HYMENOPLERA}/val/", + batch_size=1024, +) + +# %% [markdown] +# ## 2. Build the task + +# %% +model = ImageClassifier(backbone="resnet18", labels=datamodule.labels) + +# %% [markdown] +# ## 3. Create the trainer and finetune the model + +# %% +logger = CSVLogger(save_dir="logs/") +trainer = flash.Trainer(logger=logger, max_epochs=3, gpus=1) +trainer.finetune(model, datamodule=datamodule, strategy="freeze") + +# %% +metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv") +del metrics["step"] +metrics.set_index("epoch", inplace=True) +display(metrics.dropna(axis=1, how="all").head()) + +g = sn.relplot(data=metrics, kind="line") +plt.gcf().set_size_inches(12, 4) +plt.grid() + +# %% [markdown] +# ## 4. Predict what's on a few images! ants or bees? + +# %% +datamodule = ImageClassificationData.from_files( + predict_files=[ + f"{DATA_HYMENOPLERA}/val/bees/65038344_52a45d090d.jpg", + f"{DATA_HYMENOPLERA}/val/bees/590318879_68cf112861.jpg", + f"{DATA_HYMENOPLERA}/val/ants/540543309_ddbb193ee5.jpg", + ], + batch_size=3, +) +predictions = trainer.predict(model, datamodule=datamodule, output="labels") +print(predictions) + +# %% [markdown] +# ## 5. Save the model! + +# %% +trainer.save_checkpoint("image_classification_model.pt") diff --git a/templates/simple/.meta.yml b/templates/simple/.meta.yml new file mode 100644 index 0000000..db12f93 --- /dev/null +++ b/templates/simple/.meta.yml @@ -0,0 +1,13 @@ +title: How to write a PyTorch Lightning tutorial +author: PL team +created: 2021-06-15 +updated: 2021-06-17 +license: CC +build: 10 +description: | + This is a template to show how to contribute a tutorial. 
+requirements: + - matplotlib +accelerator: + - CPU + - GPU diff --git a/templates/simple/.thumb.png b/templates/simple/.thumb.png new file mode 100644 index 0000000..e778f65 Binary files /dev/null and b/templates/simple/.thumb.png differ diff --git a/templates/simple/template.py b/templates/simple/template.py new file mode 100644 index 0000000..9f30ea0 --- /dev/null +++ b/templates/simple/template.py @@ -0,0 +1,45 @@ +# %% [markdown] +# ## Create a Markdown cell +# +# `# %% [markdown]` +# +# the content of single cell shall be connected with `# ` at each line, so for example: +# `# Add some text that will be rendered as markdown text.` + +# %% [markdown] +# ## Create a code cell +# +# `# %%` + +# %% +import torch + +print(torch.__version__) + +# %% [markdown] +# ## Add any Python codes +# Easy integration with Python ecosystem libraries component. +# +# For example create a simple plot with `matplotlib` with an image: +# +# ![test image](test.png) +# +# From: https://matplotlib.org/stable/gallery/lines_bars_and_markers/simple_plot.html + +# %% +import matplotlib.pyplot as plt # noqa: E402 +import numpy as np # noqa: E402 + +# Data for plotting +t = np.arange(0.0, 2.0, 0.01) +s = 1 + np.sin(2 * np.pi * t) + +fig, ax = plt.subplots() +ax.plot(t, s) + +ax.set(xlabel="time (s)", ylabel="voltage (mV)", title="About as simple as it gets, folks") +ax.grid() + +fig.savefig("test.png") +# render image to the notebooks +plt.show() diff --git a/templates/simple/test.png b/templates/simple/test.png new file mode 100644 index 0000000..ceee8c3 Binary files /dev/null and b/templates/simple/test.png differ diff --git a/templates/titanic/.meta.yml b/templates/titanic/.meta.yml new file mode 100644 index 0000000..dbe94d7 --- /dev/null +++ b/templates/titanic/.meta.yml @@ -0,0 +1,18 @@ +title: Solving Titanic dataset with Lightning Flash +author: PL team +created: 2021-10-15 +updated: 2021-12-10 +license: CC +build: 0 +description: | + This is a template to show how to contribute a tutorial. +requirements: + - https://github.com/PyTorchLightning/lightning-flash/archive/refs/tags/0.5.2.zip#egg=lightning-flash[tabular] + - matplotlib + - seaborn +accelerator: + - CPU + - GPU +datasets: + kaggle: + - titanic diff --git a/templates/titanic/tutorial.py b/templates/titanic/tutorial.py new file mode 100644 index 0000000..a82976f --- /dev/null +++ b/templates/titanic/tutorial.py @@ -0,0 +1,101 @@ +import os + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +import torch +from flash import Trainer +from flash.tabular import TabularClassificationData, TabularClassifier + +# %% [markdown] +# ## 1. 
Create the DataModule +# +# ### Variable & Definition +# +# - survival: Survival (0 = No, 1 = Yes) +# - pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd) +# - sex: Sex +# - Age: Age in years +# - sibsp: number of siblings / spouses aboard the Titanic +# - parch: number of parents / children aboard the Titanic +# - ticket: Ticket number +# - fare: Passenger fare +# - cabin: Cabin number +# - embarked: Port of Embarkation + +# %% +data_path = os.environ.get("PATH_DATASETS", "_datasets") +path_titanic = os.path.join(data_path, "titanic") +csv_train = os.path.join(path_titanic, "train.csv") +csv_test = os.path.join(path_titanic, "test.csv") + +df_train = pd.read_csv(csv_train) +df_train["Survived"].hist(bins=2) + +# %% +datamodule = TabularClassificationData.from_csv( + categorical_fields=["Sex", "Embarked", "Cabin"], + numerical_fields=["Fare", "Age", "Pclass", "SibSp", "Parch"], + target_fields="Survived", + train_file=csv_train, + val_split=0.1, + batch_size=8, +) + +# %% [markdown] +# ## 2. Build the task + +# %% +model = TabularClassifier.from_data( + datamodule, + learning_rate=0.1, + optimizer="Adam", + n_a=8, + gamma=0.3, +) + +# %% [markdown] +# ## 3. Create the trainer and train the model + +# %% +from pytorch_lightning.loggers import CSVLogger # noqa: E402] + +logger = CSVLogger(save_dir="logs/") +trainer = Trainer( + max_epochs=10, + gpus=torch.cuda.device_count(), + logger=logger, + accumulate_grad_batches=12, + gradient_clip_val=0.1, +) + +# %% + +trainer.fit(model, datamodule=datamodule) + +# %% + +metrics = pd.read_csv(f"{trainer.logger.log_dir}/metrics.csv") +metrics.set_index("step", inplace=True) +del metrics["epoch"] +sns.relplot(data=metrics, kind="line") +plt.gca().set_ylim([0, 1.25]) +plt.gcf().set_size_inches(10, 5) + +# %% [markdown] +# ## 4. Generate predictions from a CSV + +# %% +df_test = pd.read_csv(csv_test) + +predictions = model.predict(csv_test) +print(predictions[0]) + +# %% +import numpy as np # noqa: E402] + +assert len(df_test) == len(predictions) + +df_test["Survived"] = np.argmax(predictions, axis=-1) +df_test.set_index("PassengerId", inplace=True) +df_test["Survived"].hist(bins=5)
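+
+# %% [markdown]
+# ## 5. Save a Kaggle-style submission (optional, added sketch)
+#
+# The frame above already has `PassengerId` as its index and the predicted `Survived` column, which matches the
+# layout expected by the Kaggle Titanic competition. The file name below is only an example.
+
+# %%
+df_test["Survived"].to_frame().to_csv("submission.csv")
+print(pd.read_csv("submission.csv").head())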