diff --git a/Project.toml b/Project.toml
index 25aa3fe8..6d24ed8b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Chmy"
 uuid = "33a72cf0-4690-46d7-b987-06506c2248b9"
 authors = ["Ivan Utkin <iutkin@ethz.ch>, Ludovic Raess <ludovic.rass@gmail.com>, and contributors"]
-version = "0.1.16"
+version = "0.1.17"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/README.md b/README.md
index 5c3f0ee2..f3bb143b 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@
 
 Hey, that's Chmy.jl 🚀
 
+Chmy.jl provides a backend-agnostic toolkit for finite difference computations on multi-dimensional computational staggered grids. Chmy.jl features task-based distributed memory parallelisation capabilities.
+
 ## Documentation
 Checkout the [documentation](https://PTsolvers.github.io/Chmy.jl/dev) for API reference and usage.
 
diff --git a/docs/make.jl b/docs/make.jl
index 77aea23d..88ba7faa 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -8,14 +8,26 @@ makedocs(
     authors="Ivan Utkin, Ludovic Räss and contributors",
     format = Documenter.HTML(
         prettyurls=get(ENV, "CI", nothing) == "true", # easier local build
-        ansicolor=true
+        ansicolor=true,
+        assets = ["assets/favicon.ico"],
         ),
     modules = [Chmy],
     warnonly = [:missing_docs],
     pages = Any[
         "Home" => "index.md",
-        "Usage" => Any["usage/runtests.md"],
-        "Library" => Any["lib/modules.md"]
+        "Getting Started with Chmy.jl" => "getting_started.md",
+        "Concepts" => Any["concepts/architectures.md",
+                        "concepts/grids.md",
+                        "concepts/fields.md",
+                        "concepts/bc.md",
+                        "concepts/grid_operators.md",
+                        "concepts/kernels.md"
+        ],
+        "Examples" => Any["examples/overview.md"
+        ],
+        "Library" => Any["lib/modules.md"],
+        "Developer documentation" => Any["developer_documentation/running_tests.md",
+                                         "developer_documentation/workers.md"],
     ]
 )
 
diff --git a/docs/src/assets/diffusion_2d_it_100.png b/docs/src/assets/diffusion_2d_it_100.png
new file mode 100644
index 00000000..9a00a42c
Binary files /dev/null and b/docs/src/assets/diffusion_2d_it_100.png differ
diff --git a/docs/src/assets/favicon.ico b/docs/src/assets/favicon.ico
new file mode 100644
index 00000000..b31d1f38
Binary files /dev/null and b/docs/src/assets/favicon.ico differ
diff --git a/docs/src/assets/field_set_ic_gaussian.png b/docs/src/assets/field_set_ic_gaussian.png
new file mode 100644
index 00000000..d5d8c0be
Binary files /dev/null and b/docs/src/assets/field_set_ic_gaussian.png differ
diff --git a/docs/src/assets/field_set_ic_random.png b/docs/src/assets/field_set_ic_random.png
new file mode 100644
index 00000000..b591d69e
Binary files /dev/null and b/docs/src/assets/field_set_ic_random.png differ
diff --git a/docs/src/assets/field_type_tree.svg b/docs/src/assets/field_type_tree.svg
new file mode 100644
index 00000000..a7ee7b7b
--- /dev/null
+++ b/docs/src/assets/field_type_tree.svg
@@ -0,0 +1,21 @@
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 906.0706309116662 579.5833331097583" width="2718.2118927349984" height="1738.749999329275">
+  <!-- svg-source:excalidraw -->
+  
+  <defs>
+    <style class="style-fonts">
+      @font-face {
+        font-family: "Virgil";
+        src: url("https://excalidraw.com/Virgil.woff2");
+      }
+      @font-face {
+        font-family: "Cascadia";
+        src: url("https://excalidraw.com/Cascadia.woff2");
+      }
+      @font-face {
+        font-family: "Assistant";
+        src: url("https://excalidraw.com/Assistant-Regular.woff2");
+      }
+    </style>
+    
+  </defs>
+  <rect x="0" y="0" width="906.0706309116662" height="579.5833331097583" fill="#ffffff"></rect><g stroke-linecap="round" transform="translate(10 319.1666664430918) rotate(0 103.75000000000011 47.916666666666515)"><path d="M23.96 0 C69.29 0, 114.62 0, 183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 C207.5 33.57, 207.5 43.18, 207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 C140.35 95.83, 97.16 95.83, 23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 C0 58.33, 0 44.78, 0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="none" stroke-width="0" fill="#a5d8ff"></path><path d="M23.96 0 C85.82 0, 147.68 0, 183.54 0 M23.96 0 C57.52 0, 91.08 0, 183.54 0 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M207.5 23.96 C207.5 37.13, 207.5 50.31, 207.5 71.87 M207.5 23.96 C207.5 34.46, 207.5 44.96, 207.5 71.87 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M183.54 95.83 C127.81 95.83, 72.08 95.83, 23.96 95.83 M183.54 95.83 C124.52 95.83, 65.49 95.83, 23.96 95.83 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M0 71.87 C0 56.23, 0 40.59, 0 23.96 M0 71.87 C0 61.43, 0 50.98, 0 23.96 M0 23.96 C0 7.99, 7.99 0, 23.96 0 M0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(37.578125000000114 355.0833331097583) rotate(0 76.171875 12)"><text x="76.171875" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">AbstractField</text></g><g stroke-linecap="round" transform="translate(349.2853153333332 164.58333277642532) rotate(0 103.75000000000011 47.916666666666515)"><path d="M23.96 0 C74.99 0, 126.03 0, 183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 C207.5 41, 207.5 58.03, 207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 C133.69 95.83, 83.84 95.83, 23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 C0 57.77, 0 43.67, 0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="none" stroke-width="0" fill="#a5d8ff"></path><path d="M23.96 0 C66.45 0, 108.94 0, 183.54 0 M23.96 0 C85.28 0, 146.61 0, 183.54 0 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M207.5 23.96 C207.5 34.95, 207.5 45.95, 207.5 71.87 M207.5 23.96 C207.5 39.04, 207.5 54.13, 207.5 71.87 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M183.54 95.83 C121.63 95.83, 59.72 95.83, 23.96 95.83 M183.54 95.83 C125.25 95.83, 66.97 95.83, 23.96 95.83 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M0 71.87 C0 56.42, 0 40.96, 0 23.96 M0 71.87 C0 57.66, 0 43.45, 0 23.96 M0 23.96 C0 7.99, 7.99 0, 23.96 0 M0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(376.8634403333332 200.49999944309184) rotate(0 76.171875 12)"><text x="76.171875" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">ConstantField</text></g><g stroke-linecap="round" transform="translate(349.2853153333332 473.7499997764253) rotate(0 103.75000000000011 47.916666666666515)"><path d="M23.96 0 C58.14 0, 92.32 0, 183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 C207.5 35.5, 207.5 47.03, 207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 C140.86 95.83, 98.19 95.83, 23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 C0 56.52, 0 41.17, 0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="none" stroke-width="0" fill="#ffec99"></path><path d="M23.96 0 C63.74 0, 103.52 0, 183.54 0 M23.96 0 C82.52 0, 141.08 0, 183.54 0 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M207.5 23.96 C207.5 34.37, 207.5 44.78, 207.5 71.87 M207.5 23.96 C207.5 37.93, 207.5 51.9, 207.5 71.87 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M183.54 95.83 C119.79 95.83, 56.05 95.83, 23.96 95.83 M183.54 95.83 C126.8 95.83, 70.06 95.83, 23.96 95.83 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M0 71.87 C0 53.18, 0 34.48, 0 23.96 M0 71.87 C0 57.77, 0 43.67, 0 23.96 M0 23.96 C0 7.99, 7.99 0, 23.96 0 M0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(376.8634403333332 509.6666664430918) rotate(0 76.171875 12)"><text x="76.171875" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">FunctionField</text></g><g stroke-linecap="round" transform="translate(349.28531499999974 319.16666644309225) rotate(0 103.75000000000011 47.916666666666515)"><path d="M23.96 0 C67.37 0, 110.78 0, 183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 C207.5 36.84, 207.5 49.72, 207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 C127.61 95.83, 71.69 95.83, 23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 C0 61.45, 0 51.02, 0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="none" stroke-width="0" fill="#ffec99"></path><path d="M23.96 0 C64.06 0, 104.16 0, 183.54 0 M23.96 0 C84.96 0, 145.96 0, 183.54 0 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M207.5 23.96 C207.5 34.23, 207.5 44.51, 207.5 71.87 M207.5 23.96 C207.5 35.38, 207.5 46.81, 207.5 71.87 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M183.54 95.83 C142.43 95.83, 101.31 95.83, 23.96 95.83 M183.54 95.83 C130.98 95.83, 78.41 95.83, 23.96 95.83 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M0 71.87 C0 60.5, 0 49.12, 0 23.96 M0 71.87 C0 61.33, 0 50.79, 0 23.96 M0 23.96 C0 7.99, 7.99 0, 23.96 0 M0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(423.73843999999997 355.08333310975877) rotate(0 29.296875 12)"><text x="29.296875" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Field</text></g><g stroke-linecap="round" transform="translate(688.5706303333333 164.583333109759) rotate(0 103.75000000000011 47.916666666666515)"><path d="M23.96 0 C64.91 0, 105.85 0, 183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 C207.5 36.92, 207.5 49.88, 207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 C128 95.83, 72.46 95.83, 23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 C0 57.25, 0 42.63, 0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="none" stroke-width="0" fill="#ffec99"></path><path d="M23.96 0 C57.89 0, 91.83 0, 183.54 0 M23.96 0 C72.37 0, 120.79 0, 183.54 0 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M207.5 23.96 C207.5 37.96, 207.5 51.96, 207.5 71.87 M207.5 23.96 C207.5 39.13, 207.5 54.3, 207.5 71.87 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M183.54 95.83 C130.45 95.83, 77.36 95.83, 23.96 95.83 M183.54 95.83 C150.28 95.83, 117.02 95.83, 23.96 95.83 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M0 71.87 C0 55.32, 0 38.77, 0 23.96 M0 71.87 C0 55.17, 0 38.47, 0 23.96 M0 23.96 C0 7.99, 7.99 0, 23.96 0 M0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(745.4456303333333 200.4999997764255) rotate(0 46.875 12)"><text x="46.875" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">OneField</text></g><g stroke-linecap="round" transform="translate(688.5706309116661 10) rotate(0 103.75000000000006 47.916666666666515)"><path d="M23.96 0 C60.05 0, 96.15 0, 183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 C207.5 40.98, 207.5 58.01, 207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 C124.11 95.83, 64.68 95.83, 23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 C0 60.14, 0 48.41, 0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="none" stroke-width="0" fill="#ffec99"></path><path d="M23.96 0 C77.47 0, 130.99 0, 183.54 0 M23.96 0 C79.84 0, 135.71 0, 183.54 0 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M207.5 23.96 C207.5 40.89, 207.5 57.81, 207.5 71.87 M207.5 23.96 C207.5 41.47, 207.5 58.98, 207.5 71.87 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M183.54 95.83 C123.4 95.83, 63.26 95.83, 23.96 95.83 M183.54 95.83 C143.79 95.83, 104.03 95.83, 23.96 95.83 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M0 71.87 C0 58.76, 0 45.65, 0 23.96 M0 71.87 C0 61.6, 0 51.32, 0 23.96 M0 23.96 C0 7.99, 7.99 0, 23.96 0 M0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(733.726880911666 45.916666666666515) rotate(0 58.59375 12)"><text x="58.59375" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">ValueField</text></g><g stroke-linecap="round" transform="translate(688.5706299999999 319.1666664430925) rotate(0 103.75000000000006 47.91666666666663)"><path d="M23.96 0 C84.72 0, 145.48 0, 183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 C207.5 34.53, 207.5 45.1, 207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 C151.49 95.83, 119.44 95.83, 23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 C0 61.95, 0 52.02, 0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="none" stroke-width="0" fill="#ffec99"></path><path d="M23.96 0 C60.35 0, 96.73 0, 183.54 0 M23.96 0 C67.8 0, 111.65 0, 183.54 0 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M183.54 0 C199.51 0, 207.5 7.99, 207.5 23.96 M207.5 23.96 C207.5 39.17, 207.5 54.38, 207.5 71.87 M207.5 23.96 C207.5 41.44, 207.5 58.93, 207.5 71.87 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M207.5 71.87 C207.5 87.85, 199.51 95.83, 183.54 95.83 M183.54 95.83 C132.06 95.83, 80.59 95.83, 23.96 95.83 M183.54 95.83 C122.33 95.83, 61.13 95.83, 23.96 95.83 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M23.96 95.83 C7.99 95.83, 0 87.85, 0 71.87 M0 71.87 C0 59.52, 0 47.16, 0 23.96 M0 71.87 C0 54.66, 0 37.44, 0 23.96 M0 23.96 C0 7.99, 7.99 0, 23.96 0 M0 23.96 C0 7.99, 7.99 0, 23.96 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(739.5862549999998 355.083333109759) rotate(0 52.734375 12)"><text x="52.734375" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">ZeroField</text></g><g stroke-linecap="round"><g transform="translate(216.8333326666666 367.0833327764253) rotate(0 39 0)"><path d="M0 0 C13 0, 65 0, 78 0 M0 0 C13 0, 65 0, 78 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(295.2083336666666 202.83333277642532) rotate(0 0 164.25)"><path d="M0 0 C0 54.75, 0 273.75, 0 328.5 M0 0 C0 54.75, 0 273.75, 0 328.5" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(556.7853152421201 212.4999989658288) rotate(0 39 0)"><path d="M0 0 C13 0, 65 0, 78 0 M0 0 C13 0, 65 0, 78 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(635.1603162421202 48.24999896582881) rotate(0 0 164.25)"><path d="M0 0 C0 54.75, 0 273.75, 0 328.5 M0 0 C0 54.75, 0 273.75, 0 328.5" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(295.2006480898708 202.83333338967327) rotate(0 27.042333902095777 0)"><path d="M0 0 C9.01 0, 45.07 0, 54.08 0 M0 0 C9.01 0, 45.07 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(295.2006480898708 202.83333338967327) rotate(0 27.042333902095777 0)"><path d="M54.08 0 L40.49 6.34 L40.49 -6.34 L54.08 0" stroke="none" stroke-width="0" fill="#1e1e1e" fill-rule="evenodd"></path><path d="M54.08 0 C50.84 1.51, 47.59 3.03, 40.49 6.34 M54.08 0 C50.73 1.56, 47.38 3.13, 40.49 6.34 M40.49 6.34 C40.49 3.15, 40.49 -0.03, 40.49 -6.34 M40.49 6.34 C40.49 3.77, 40.49 1.21, 40.49 -6.34 M40.49 -6.34 C43.93 -4.73, 47.37 -3.13, 54.08 0 M40.49 -6.34 C44.1 -4.66, 47.71 -2.97, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(294.8333320386697 367.0833325894764) rotate(0 27.042333902095777 0)"><path d="M0 0 C9.01 0, 45.07 0, 54.08 0 M0 0 C9.01 0, 45.07 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(294.8333320386697 367.0833325894764) rotate(0 27.042333902095777 0)"><path d="M54.08 0 L40.49 6.34 L40.49 -6.34 L54.08 0" stroke="none" stroke-width="0" fill="#1e1e1e" fill-rule="evenodd"></path><path d="M54.08 0 C49.58 2.1, 45.08 4.2, 40.49 6.34 M54.08 0 C51.14 1.37, 48.19 2.75, 40.49 6.34 M40.49 6.34 C40.49 3.54, 40.49 0.75, 40.49 -6.34 M40.49 6.34 C40.49 2.03, 40.49 -2.27, 40.49 -6.34 M40.49 -6.34 C43.42 -4.97, 46.35 -3.6, 54.08 0 M40.49 -6.34 C44.52 -4.46, 48.56 -2.58, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(294.83333146592327 531.333333391733) rotate(0 27.042333902095777 0)"><path d="M0 0 C9.01 0, 45.07 0, 54.08 0 M0 0 C9.01 0, 45.07 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(294.83333146592327 531.333333391733) rotate(0 27.042333902095777 0)"><path d="M54.08 0 L40.49 6.34 L40.49 -6.34 L54.08 0" stroke="none" stroke-width="0" fill="#1e1e1e" fill-rule="evenodd"></path><path d="M54.08 0 C49.99 1.91, 45.89 3.82, 40.49 6.34 M54.08 0 C49.85 1.98, 45.61 3.95, 40.49 6.34 M40.49 6.34 C40.49 1.84, 40.49 -2.66, 40.49 -6.34 M40.49 6.34 C40.49 2.37, 40.49 -1.6, 40.49 -6.34 M40.49 -6.34 C45.64 -3.94, 50.78 -1.54, 54.08 0 M40.49 -6.34 C43.21 -5.07, 45.94 -3.8, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(634.485962696154 376.74999873091474) rotate(0 27.042333902095777 0)"><path d="M0 0 C9.01 0, 45.07 0, 54.08 0 M0 0 C9.01 0, 45.07 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(634.485962696154 376.74999873091474) rotate(0 27.042333902095777 0)"><path d="M54.08 0 L40.49 6.34 L40.49 -6.34 L54.08 0" stroke="none" stroke-width="0" fill="#1e1e1e" fill-rule="evenodd"></path><path d="M54.08 0 C50.67 1.59, 47.25 3.19, 40.49 6.34 M54.08 0 C50.97 1.45, 47.86 2.9, 40.49 6.34 M40.49 6.34 C40.49 2.22, 40.49 -1.9, 40.49 -6.34 M40.49 6.34 C40.49 2.82, 40.49 -0.69, 40.49 -6.34 M40.49 -6.34 C45.22 -4.13, 49.96 -1.93, 54.08 0 M40.49 -6.34 C44.78 -4.34, 49.07 -2.34, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(634.4859622952171 212.49999925665315) rotate(0 27.042333902095777 0)"><path d="M0 0 C9.01 0, 45.07 0, 54.08 0 M0 0 C9.01 0, 45.07 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(634.4859622952171 212.49999925665315) rotate(0 27.042333902095777 0)"><path d="M54.08 0 L40.49 6.34 L40.49 -6.34 L54.08 0" stroke="none" stroke-width="0" fill="#1e1e1e" fill-rule="evenodd"></path><path d="M54.08 0 C49.58 2.1, 45.08 4.2, 40.49 6.34 M54.08 0 C49.9 1.95, 45.71 3.9, 40.49 6.34 M40.49 6.34 C40.49 3.02, 40.49 -0.29, 40.49 -6.34 M40.49 6.34 C40.49 1.7, 40.49 -2.94, 40.49 -6.34 M40.49 -6.34 C45.38 -4.06, 50.28 -1.77, 54.08 0 M40.49 -6.34 C44.91 -4.28, 49.32 -2.22, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round"><g transform="translate(634.4859623295362 48.24999932999549) rotate(0 27.042333902095777 0)"><path d="M0 0 C9.01 0, 45.07 0, 54.08 0 M0 0 C9.01 0, 45.07 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g><g transform="translate(634.4859623295362 48.24999932999549) rotate(0 27.042333902095777 0)"><path d="M54.08 0 L40.49 6.34 L40.49 -6.34 L54.08 0" stroke="none" stroke-width="0" fill="#1e1e1e" fill-rule="evenodd"></path><path d="M54.08 0 C51.27 1.31, 48.46 2.62, 40.49 6.34 M54.08 0 C50.01 1.9, 45.94 3.8, 40.49 6.34 M40.49 6.34 C40.49 1.52, 40.49 -3.3, 40.49 -6.34 M40.49 6.34 C40.49 3.29, 40.49 0.24, 40.49 -6.34 M40.49 -6.34 C43.91 -4.75, 47.32 -3.15, 54.08 0 M40.49 -6.34 C43.6 -4.89, 46.71 -3.44, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0 M54.08 0 C54.08 0, 54.08 0, 54.08 0" stroke="#1e1e1e" stroke-width="2" fill="none"></path></g></g><mask></mask><g stroke-linecap="round" transform="translate(635.4625477829982 532.8462523410176) rotate(0 27.483622170580986 8.456499129409622)"><path d="M0 0 L54.97 0 L54.97 16.91 L0 16.91" stroke="none" stroke-width="0" fill="#ffec99"></path><path d="M0 0 C17.19 0, 34.39 0, 54.97 0 M0 0 C14.99 0, 29.98 0, 54.97 0 M54.97 0 C54.97 4.17, 54.97 8.34, 54.97 16.91 M54.97 0 C54.97 6.32, 54.97 12.64, 54.97 16.91 M54.97 16.91 C40.94 16.91, 26.92 16.91, 0 16.91 M54.97 16.91 C36.78 16.91, 18.59 16.91, 0 16.91 M0 16.91 C0 10.53, 0 4.16, 0 0 M0 16.91 C0 13.07, 0 9.24, 0 0" stroke="transparent" stroke-width="2" fill="none"></path></g><g stroke-linecap="round" transform="translate(635.4625476917989 493.57408255675364) rotate(0 27.48362217058093 8.456499129409622)"><path d="M0 0 L54.97 0 L54.97 16.91 L0 16.91" stroke="none" stroke-width="0" fill="#a5d8ff"></path><path d="M0 0 C21.48 0, 42.95 0, 54.97 0 M0 0 C14.4 0, 28.8 0, 54.97 0 M54.97 0 C54.97 5.53, 54.97 11.06, 54.97 16.91 M54.97 0 C54.97 5.95, 54.97 11.89, 54.97 16.91 M54.97 16.91 C33.51 16.91, 12.06 16.91, 0 16.91 M54.97 16.91 C36.89 16.91, 18.82 16.91, 0 16.91 M0 16.91 C0 11.2, 0 5.48, 0 0 M0 16.91 C0 12.74, 0 8.56, 0 0" stroke="transparent" stroke-width="2" fill="none"></path></g><g transform="translate(708.2069489087729 490.0305822093296) rotate(0 76.171875 12)"><text x="76.171875" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Abstract Type</text></g><g transform="translate(708.2069488507913 529.3027505607924) rotate(0 76.171875 12)"><text x="76.171875" y="19.3125" font-family="Cascadia, Segoe UI Emoji" font-size="20px" fill="#1e1e1e" text-anchor="middle" style="white-space: pre;" direction="ltr" dominant-baseline="alphabetic">Concrete Type</text></g><g stroke-linecap="round" transform="translate(611.1645280690291 472.1154563079617) rotate(0 142.45305119471442 48.73393835271304)"><path d="M24.37 0 C111.96 0, 199.55 0, 260.54 0 M260.54 0 C276.78 0, 284.91 8.12, 284.91 24.37 M284.91 24.37 C284.91 34.33, 284.91 44.29, 284.91 73.1 M284.91 73.1 C284.91 89.35, 276.78 97.47, 260.54 97.47 M260.54 97.47 C188.92 97.47, 117.3 97.47, 24.37 97.47 M24.37 97.47 C8.12 97.47, 0 89.35, 0 73.1 M0 73.1 C0 57.79, 0 42.48, 0 24.37 M0 24.37 C0 8.12, 8.12 0, 24.37 0" stroke="#ced4da" stroke-width="2.5" fill="none" stroke-dasharray="8 10"></path></g></svg>
\ No newline at end of file
diff --git a/docs/src/assets/grid_2d.png b/docs/src/assets/grid_2d.png
new file mode 100644
index 00000000..a2b6ec0e
Binary files /dev/null and b/docs/src/assets/grid_2d.png differ
diff --git a/docs/src/assets/grid_3d.png b/docs/src/assets/grid_3d.png
new file mode 100644
index 00000000..7f33a410
Binary files /dev/null and b/docs/src/assets/grid_3d.png differ
diff --git a/docs/src/assets/mixed_bc_example.png b/docs/src/assets/mixed_bc_example.png
new file mode 100644
index 00000000..babe075c
Binary files /dev/null and b/docs/src/assets/mixed_bc_example.png differ
diff --git a/docs/src/assets/staggered_grid.png b/docs/src/assets/staggered_grid.png
new file mode 100644
index 00000000..5643fd74
Binary files /dev/null and b/docs/src/assets/staggered_grid.png differ
diff --git a/docs/src/assets/staggered_grid_cell.png b/docs/src/assets/staggered_grid_cell.png
new file mode 100644
index 00000000..ba0c74ee
Binary files /dev/null and b/docs/src/assets/staggered_grid_cell.png differ
diff --git a/docs/src/concepts/architectures.md b/docs/src/concepts/architectures.md
new file mode 100644
index 00000000..d5ec3e05
--- /dev/null
+++ b/docs/src/concepts/architectures.md
@@ -0,0 +1,42 @@
+# Architectures
+
+## Backend Selection & Architecture Initialization
+
+Chmy.jl supports CPUs, as well as CUDA and ROC backends for Nvidia and AMD GPUs through a thin wrapper around the [`KernelAbstractions.jl`](https://github.com/JuliaGPU/KernelAbstractions.jl) for users to select desirable backends.
+
+```julia
+# Default with CPU
+arch = Arch(CPU())
+```
+
+```julia
+using CUDA
+arch = Arch(CUDABackend())
+```
+
+```julia
+using AMDGPU
+arch = Arch(ROCBackend())
+```
+
+At the beginning of program, one may specify the backend and initialize the architecture they desire to use. The initialized `arch` variable will be required explicitly at creation of some objects such as grids and kernel launchers.
+
+## Specifying the device ID and stream priority
+
+On systems with multiple GPUs, passing the keyword argument `device_id` to the `Arch` constructor will select and set the selected device as a current device.
+
+For advanced users, we provide a function `activate!(arch; priority)` for specifying the stream priority owned by the task one is executing. The stream priority will be set to `:normal` by default, where `:low` and `:high` are also possible options given that the target backend has priority control over streams implemented.
+
+## Distributed Architecture
+
+Our distributed architecture builds upon the abstraction of having GPU clusters that build on the same GPU architecture. Note that in general, GPU clusters may be equipped with hardware from different vendors, incorporating different types of GPUs to exploit their unique capabilities for specific tasks.
+
+To make the `Architecture` object aware of MPI topology, user can pass an MPI communicator object and dimensions of the Cartesian topology to the `Arch` constructor:
+
+```julia
+using MPI
+
+arch = Arch(CPU(), MPI.COMM_WORLD, (0, 0, 0))
+```
+
+Passing zeros as the last argument will automatically spread the dimensions to be as close as possible to each other, see [MPI.jl documentation](https://juliaparallel.org/MPI.jl/stable/reference/topology/#MPI.Dims_create) for details.
diff --git a/docs/src/concepts/bc.md b/docs/src/concepts/bc.md
new file mode 100644
index 00000000..c129da36
--- /dev/null
+++ b/docs/src/concepts/bc.md
@@ -0,0 +1,67 @@
+# Boundary Conditions
+
+Using [Chmy.jl](https://github.com/PTsolvers/Chmy.jl), we aim to study partial differential equations (PDEs) arising from physical or engineering problems. Additional initial and/or boundary conditions are necessary for the model problem to be well-posed, ensuring the existence and uniqueness of a stable solution.
+
+We provide a small overview for boundary conditions that one often encounters. In the following, we consider the unknown function $u : \Omega \mapsto \mathbb{R}$ defined on some  bounded computational domain $\Omega \subset \mathbb{R}^d$ in a $d$-dimensional space. With the domain boundary denoted by $\partial \Omega$, we have some function $g : \partial \Omega \mapsto \mathbb{R}$ prescribed on the boundary.
+
+| Type    | Form | Example |
+|:------------|:------------|:---------|
+| Dirichlet | $u = g$ on $\partial \Omega$ | In fluid dynamics, the no-slip condition for viscous fluids states that at a solid boundary the fluid has zero velocity relative to the boundary. |
+| Neumann | $\partial_{\boldsymbol{n}} u = g$ on $\partial \Omega$, where $\boldsymbol{n}$ is the outer normal vector to $\Omega$ | It specifies the values in which the derivative of a solution is applied within the boundary of the domain. An application in thermodynamics is a prescribed heat flux through the boundary |
+| Robin  |  $u + \alpha \partial_\nu u = g$ on $\partial \Omega$, where $\alpha \in \mathbb{R}$.  | Also called impedance boundary conditions from their application in electromagnetic problems |
+
+## Applying Boundary Conditions with `bc!()`
+
+In the following, we describe the syntax in [Chmy.jl](https://github.com/PTsolvers/Chmy.jl) for launching kernels that impose boundary conditions on some `field` that is well-defined on a `grid` with backend specified through `arch`.
+
+For Dirichlet and Neumann boundary conditions, they are referred to as homogeneous if $g = 0$, otherwise they are non-homogeneous if $g = v$ holds, for some $v\in \mathbb{R}$.
+
+|     | Homogeneous | Non-homogeneous |
+|:------------|:------------|:------------|
+| Dirichlet on $\partial \Omega$ | `bc!(arch, grid, field => Dirichlet())` | `bc!(arch, grid, field => Dirichlet(v))` |
+| Neumann on $\partial \Omega$ | `bc!(arch, grid, field => Neumann())` | `bc!(arch, grid, field => Neumann(v))` |
+
+Note that the syntax shown in the table above is a **fused expression** of both _specifying_ and _applying_ the boundary conditions.
+
+!!! warning "$\partial \Omega$ Refers to the Entire Domain Boundary!"
+    By specifying `field` to a single boundary condition, we impose the boundary condition on the entire domain boundary by default. See the section for "Mixed Boundary Conditions" below for specifying different BC on different parts of the domain boundary.
+
+Alternatively, one could also define the boundary conditions beforehand using `batch()` provided the `grid` information as well as the `field` variable. This way the boundary condition to be prescibed is **precomputed**.
+
+```julia
+# pre-compute batch
+bt = batch(grid, field => Neumann()) # specify Neumann BC for the variable `field`
+bc!(arch, grid, bt)                  # apply the boundary condition
+```
+
+In the script [batcher.jl](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/batcher.jl), we provide a MWE using both **fused** and **precomputed** expressions for BC update.
+
+## Specifying BC within a `launch`
+
+When using `launch` to specify the execution of a kernel (more see section [Kernels](./kernels.md)), one can pass the specified boundary condition(s) as an optional parameter using `batch`, provided the grid information of the discretized space. This way we can gain efficiency from making good use of already cached values.
+
+In the 2D diffusion example as introduced in the tutorial ["Getting Started with Chmy.jl"](../getting_started.md), we need to update the temperature field `C` at k-th iteration using the values of heat flux `q` and physical time step size `Δt` from (k-1)-th iteration. When launching the kernel `update_C!` with `launch`, we simultaneously launch the kernel for the BC update using:
+
+```julia
+launch(arch, grid, update_C! => (C, q, Δt, grid); bc=batch(grid, C => Neumann(); exchange=C))
+```
+
+### Mixed Boundary Conditions
+
+In the code example above, by specifying boundary conditions using syntax such as `field => Neumann()`, we essentially launch a kernel that impose the Neumann boundary condition on the entire domain boundary $\partial \Omega$. More often, one may be interested in prescribing different boundary conditions on different parts of $\partial \Omega$.
+
+The following figure showcases a 2D square domain $\Omega$ with different boundary conditions applied on each side:
+
+- The top boundary (red) is a Dirichlet boundary condition where $u = a$.
+- The bottom boundary (blue) is also a Dirichlet boundary condition where $u = b$.
+- The left and right boundaries (green) are Neumann boundary conditions where $\frac{\partial u}{\partial y} = 0$.
+
+```@raw html
+<img src="../assets/mixed_bc_example.png" width="60%"/>
+```
+
+To launch a kernel that satisfies these boundary conditions in Chmy.jl, you can use the following code:
+
+```julia
+bc!(arch, grid, field => (x = Neumann(), y = (Dirichlet(b), Dirichlet(a))))
+```
diff --git a/docs/src/concepts/fields.md b/docs/src/concepts/fields.md
new file mode 100644
index 00000000..a3c27d85
--- /dev/null
+++ b/docs/src/concepts/fields.md
@@ -0,0 +1,138 @@
+# Fields
+
+With a given grid that allows us to define each point uniquely in a high-dimensional space, we abstract the data values to be defined on the grid under the concept `AbstractField`. Following is the type tree of the abstract field and its derived data types.
+
+```@raw html
+<img src="../assets/field_type_tree.svg" width="70%"/>
+```
+
+## Defining a multi-dimensional `Field`
+
+Consider the following example, where we defined a variable `grid` of type `Chmy.UniformGrid`, similar as in the previous section [Grids](./grids.md). We can now define physical properties on the grid.
+
+When defining a scalar field `Field` on the grid, we need to specify the arrangement of the field values. These values can either be stored at the cell centers of each control volume `Center()` or on the cell vertices/faces `Vertex()`.
+
+```julia
+# Define geometry, architecture..., a 2D grid
+grid = UniformGrid(arch; origin=(-lx/2, -ly/2), extent=(lx, ly), dims=(nx, ny))
+
+# Define pressure as a scalar field
+Pr = Field(backend, grid, Center())
+```
+
+With the methods `VectorField` and `TensorField`, we can construct 2-dimensional and 3-dimensional fields, with predefined locations for each field dimension on a staggered grid.
+
+```julia
+# Define velocity as a vector field on the 2D grid
+V = VectorField(backend, grid)
+
+# Define stress as a tensor field on the 2D grid
+τ = TensorField(backend, grid)
+```
+
+Use the function `location` to get the location of the field as a tuple. Vector and tensor fields are currently defined as `NamedTuple`'s (likely to change in the future), so one could query the locations of individual components, e.g. `location(V.x)` or `location(τ.xy)`
+
+!!! tip "Acquiring Locations on the Grid Cell"
+    One could use a convenient getter for obtaining locations of variable on the staggered-grid. Such as `Chmy.location(Pr)` for scalar-valued pressure field and `Chmy.location(τ.xx)` for a tensor field.
+
+### Initialising `Field`
+
+Chmy.jl provides functionality to set the values of the fields as a function of spatial coordinates:
+
+```julia
+C = Field(backend, grid, Center())
+
+# Set initial values of the field randomly
+set!(C, grid, (_, _) -> rand())
+
+# Set initial values to 2D Gaussian
+set!(C, grid, (x, y) -> exp(-x^2 - y^2))
+```
+
+```@raw html
+<img src="../assets/field_set_ic_random.png" width="50%"/>
+```
+
+```@raw html
+<img src="../assets/field_set_ic_gaussian.png" width="50%"/>
+```
+
+A slightly more complex usage involves passing extra parameters to be used for initial conditions setup.
+
+```julia
+# Define a temperature field with values on cell centers
+T = Field(backend, grid, Center())
+
+# Function for setting up the initial conditions on T
+init_incl(x, y, x0, y0, r, in, out) = ifelse((x - x0)^2 + (y - y0)^2 < r^2, in, out)
+
+# Set up the initial conditions with parameters specified
+set!(T, grid, init_incl; parameters=(x0=0.0, y0=0.0, r=0.1lx, in=T0, out=Ta))
+```
+
+## Defining a parameterized `FunctionField`
+
+A field could also be represented in a parameterized way, having a function that associates a single number to every point in the space.
+
+An object of the concrete type `FunctionField` can be initialized with its constructor. The constructor takes in 
+
+1. A function `func`
+2. A `grid`
+3. A location tuple `loc` for specifying the distribution of variables
+
+Optionally, one can also use the boolean variable `discrete` to indicate if the function field is typed `Discrete` or `Continuous`. Any additional parameters to be used in the function `func` can be passed to the optional parameter `parameters`.
+
+### Example: Creation of a parameterized function field
+Followingly, we create a `gravity` variable that is two-dimensional and comprises of two parameterized `FunctionField` objects on a predefined uniform grid `grid`.
+
+**1. Define Functions that Parameterize the Field**
+
+In this step, we specify how the gravity field should be parameterized in x-direction and y-direction, with `η` as the additional parameter used in the parameterization.
+
+```julia
+# forcing terms
+ρgx(x, y, η) = -0.5 * η * π^2 * sin(0.5π * x) * cos(0.5π * y)
+ρgy(x, y, η) = 0.5 * η * π^2 * cos(0.5π * x) * sin(0.5π * y)
+```
+
+**2. Define Locations for Variable Positioning**
+
+We specify the location on the fully-staggered grid as introduced in the _Location on a Grid Cell_ section of the concept [Grids](./grids.md).
+
+```julia
+vx_node = (Vertex(), Center())
+vy_node = (Center(), Vertex())
+```
+
+**3. Define the 2D Gravity Field**
+
+By specifying the locations on which the parameterized field should be calculated, as well as concretizing the value `η = η0` by passing it as the optional parameter `parameters` to the constructor, we can define the 2D gravity field:
+
+```julia
+η0 = 1.0
+gravity = (x=FunctionField(ρgx, grid, vx_node; parameters=η0),
+           y=FunctionField(ρgy, grid, vy_node; parameters=η0))
+```
+
+## Defining Constant Fields
+
+For completeness, we also provide an abstract type `ConstantField`, which comprises of a generic `ValueField` type, and two special types `ZeroField`, `OneField` allowing dispatch for special casess. With such a construct, we can easily define value fields properties and other parameters using constant values in a straightforward and readable manner. Moreover, explicit information about the grid on which the field should be defined can be abbreviated. For example:
+
+```julia
+# Defines a field with constant values 1.0
+field = Chmy.ValueField(1.0)
+```
+
+Alternatively, we could also use the `OneField` type, providing type information about the contents of the field.
+
+```julia
+# Defines a field with constant value 1.0
+onefield = Chmy.OneField{Float64}()
+```
+
+Notably, these two fields shall equal to each other as expected.
+
+```julia
+julia> field == onefield
+true
+```
diff --git a/docs/src/concepts/grid_operators.md b/docs/src/concepts/grid_operators.md
new file mode 100644
index 00000000..bf60cddd
--- /dev/null
+++ b/docs/src/concepts/grid_operators.md
@@ -0,0 +1,107 @@
+# Grid Operators
+
+Chmy.jl currently supports various finite difference operators for fields defined in Cartesian coordinates. The table below summarizes the most common usage of grid operators, with the grid `g::StructuredGrid` and index `I = @index(Global, Cartesian)` defined and `P = Field(backend, grid, location)` is some field defined on the grid `g`.
+
+| Mathematical Formulation | Code |
+|:-------|:------------|
+| $\frac{\partial}{\partial x} P$ | ` ∂x(P, g, I)` |
+| $\frac{\partial}{\partial y} P$ | ` ∂y(P, g, I)` |
+| $\frac{\partial}{\partial z} P$ | ` ∂z(P, g, I)` |
+| $\nabla P$ | ` divg(P, g, I)` |
+
+## Computing the Divergence of a Vector Field
+
+To illustrate the usage of grid operators, we compute the divergence of an vector field $V$ using the `divg` function. We first allocate memory for required fields.
+
+```julia
+V  = VectorField(backend, grid)
+∇V = Field(backend, grid, Center())
+# use set! to set up the initial vector field...
+```
+
+The kernel that computes the divergence needs to have the grid information passed as for other finite difference operators.
+
+```julia
+@kernel inbounds = true function update_∇!(V, ∇V, g::StructuredGrid, O)
+    I = @index(Global, Cartesian)
+    I = I + O
+    ∇V[I] = divg(V, g, I)
+end
+```
+
+The kernel can then be launched when required as we detailed in section [Kernels](./kernels.md).
+
+```julia
+launch(arch, grid, update_∇! => (V, ∇V, grid))
+```
+
+## Masking
+
+Masking allows selectively applying operations only where needed, allowing more flexible control over the grid operators and improving performance. Thus, by providing masked grid operators, we enable more flexible control over the domain on which the grid operators should be applied.
+
+In the following example, we first define a mask `ω` on the 2D `StructuredGrid`. Then we specify to **not** mask the center area of all Vx, Vy nodes (accessible through `ω.vc`, `ω.cv`) on the staggered grid.
+
+```julia
+# define the mask
+ω = FieldMask2D(arch, grid) # with backend and grid geometry defined
+
+# define the initial inclusion
+r = 2.0
+init_inclusion = (x,y) -> ifelse(x^2 + y^2 < r^2, 1.0, 0.0)
+
+# mask all other entries other than the initial inclusion 
+set!(ω.vc, grid, init_inclusion)
+set!(ω.cv, grid, init_inclusion)
+```
+
+We can then pass the mask to other grid operators when applying them within the kernel. When computing masked derivatives, a mask being the subtype of `AbstractMask` is premultiplied at the corresponding grid location for each operand:
+
+```julia
+@kernel function update_strain_rate!(ε̇, V, ω::AbstractMask, g::StructuredGrid, O)
+    I = @index(Global, Cartesian)
+    I = I + O
+    # with masks ω
+    ε̇.xx[I] = ∂x(V.x, ω, g, I)
+    ε̇.yy[I] = ∂y(V.y, ω, g, I)
+    ε̇.xy[I] = 0.5 * (∂y(V.x, ω, g, I) + ∂x(V.y, ω, g, I))
+end
+```
+
+The kernel can be launched as follows, with some launcher defined using `launch = Launcher(arch, grid)`:
+
+```julia
+# define fields
+ε̇ = TensorField(backend, grid)
+V = VectorField(backend, grid)
+
+# launch kernel
+launch(arch, grid, update_strain_rate! => (ε̇, V, ω, grid))
+```
+
+## Interpolation
+
+Interpolating physical parameters such as permeability and density between various grid locations is frequently necessary on a staggered grid. Chmy.jl provides functions for performing interpolations of field values between different locations.
+
+In the following example, we specify to use the linear interpolation rule `lerp` when interpolating nodal values of the density field `ρ`, defined on pressure nodes (with location `(Center(), Center())`) to `ρvx` and `ρvy`, defined on Vx and Vy nodes respectively.
+
+```julia
+# define density ρ on pressure nodes
+ρ   = Field(backend, grid, Center())
+ρ0  = 3.0; set!(ρ, ρ0)
+
+# allocate memory for density on Vx, Vy nodes
+ρvx = Field(backend, grid, (Vertex(), Center()))
+ρvy = Field(backend, grid, (Center(), Vertex()))
+```
+
+The kernel `interpolate_ρ!` performs the actual interpolation of nodal values and requires the grid information passed by `g`.
+
+```julia
+@kernel function interpolate_ρ!(ρ, ρvx, ρvy, g::StructuredGrid, O)
+    I = @index(Global, Cartesian)
+    I = I + O
+    # Interpolate from pressure nodes to Vx, Vy nodes
+    ρvx = lerp(ρ, location(ρvx), g, I)
+    ρvy = lerp(ρ, location(ρvy), g, I)
+end
+```
diff --git a/docs/src/concepts/grids.md b/docs/src/concepts/grids.md
new file mode 100644
index 00000000..37d1f617
--- /dev/null
+++ b/docs/src/concepts/grids.md
@@ -0,0 +1,101 @@
+# Grids
+
+The choice of numerical grid used depends on the type of equations to be resolved and affects the discretization schemes used. The design of the `Chmy.Grids` module aims to provide a robust yet flexible user API in customizing the numerical grids used for spatial discretization.
+
+We currently support grids with quadrilateral cells. An `N`-dimensional numerical grid contains N spatial dimensions, each represented by an axis.
+
+| Grid Properties | Description |Tunable Parameters |
+|:-------|:------------|:------------|
+| Dimensions | The grid can be N-dimensional by having N axes. | `AbstractAxis` |
+| Distribution of Nodal Points | The grid can be regular (uniform distribution) or non-regular (irregular distribution). | `UniformAxis`, `FunctionAxis` |
+| Distribution of Variables | The grid can be non-staggered (collocated) or staggered, affecting how variables are positioned within the grid. | `Center`, `Vertex` |
+
+## Axis
+
+Objects of type `AbstractAxis` are building blocks of numerical grids. We can either define equidistant axes with `UniformAxis`, or parameterized axes with `FunctionAxis`.
+
+### Uniform Axis
+
+To define a uniform axis, we need to provide:
+
+- `Origin`: The starting point of the axis.
+- `Extent`: The length of the section of the axis considered.
+- `Cell Length`: The length of each cell along the axis.
+
+With the information above, an axis can be defined and incorporated into a spatial dimension. The `spacing` (with alias `Δ`) and `inv_spacing` (with alias `iΔ`) functions allow convenient access to the grid spacing (`Δx`/`Δy`/`Δz`) and its reciprocal, respectively.
+
+### Function Axis
+
+As an alternative, one could also define a `FunctionAxis` object using a function that parameterizes the spacing of the axis, together with the length of the axis.
+
+```julia
+f = i -> ((i - 1) / 4)^1.5
+length = 4
+parameterized_axis = FunctionAxis(f, length)
+```
+
+## Structured Grids
+
+A common mesh structure that is used for the spatial discretization in the finite difference approach is a structured grid (concrete type `StructuredGrid` or its alias `SG`).
+
+We provide a function `UniformGrid` for creating an equidistant `StructuredGrid`, that essentially boils down to having axes of type `UniformAxis` in each spatial dimension.
+
+```julia
+# with architecture as well as numerics lx/y/z and nx/y/z defined
+grid   = UniformGrid(arch; 
+                    origin=(-lx/2, -ly/2, -lz/2), 
+                    extent=(lx, ly, lz), 
+                    dims=(nx, ny, nz))
+```
+
+!!! info "Interactive Grid Visualization" 
+    - [grids_2d.jl](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl):  Visualization of a 2D `StructuredGrid`
+    -  [grids_3d.jl](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl):  Visualization of a 3D `StructuredGrid`
+
+```@raw html
+<img src="../assets/grid_2d.png" width="50%"/>
+```
+
+```@raw html
+<img src="../assets/grid_3d.png" width="50%"/>
+```
+
+## Location on a Grid Cell
+
+In order to allow full control over the distribution of different variables on the grid, we provide a high-level abstraction of the property location on a grid cell with the abstract type `Location`. More concretely, a property location along a spatial dimension can be either of concrete type `Center` or `Vertex` on a structured grid.
+
+We illustrate how to specify the location within a grid cell on a fully staggered uniform grid. The following 2D example also has ghost nodes illustrated that are located immediately outside the domain boundary.
+
+```@raw html
+<img src="../assets/staggered_grid.png" width="80%"/>
+```
+
+In the following example, we zoom into a specific cell on a **fully-staggered grid**. By specifying for both x- and y-dimensions whether the node locates at the `Center` (C) or `Vertex` (V) along the respective axis, we can arrive in 4 categories of nodes on a 2D quadrilateral cell, which we refer to as "basic", "pressure", "Vx" and "Vy" nodes, following common practices.
+
+```@raw html
+<img src="../assets/staggered_grid_cell.png" width="50%"/>
+```
+
+If all variables are defined on basic nodes, specified by `(V,V)` locations, we have the simplest non-staggered **collocated grid**.
+
+## Dimensions of Fields on Structured Grids
+
+With a structured grid defined that consists of `nx = N` cells horizontally and `ny = M` cells vertically, we have the following dimensions for fields associated with the grid.
+
+| Node Type | Field Dimension | Location |
+|:----------|:----------------|:---------|
+| Cell vertex | $(N + 1) \times (M + 1)$ | `(V, V)` |
+| X interface | $(N + 1) \times M$       | `(V, C)` |
+| Y interface | $ N \times (M + 1)$      | `(C, V)` |
+| Cell Center | $N \times M$             | `(C, C)` |
+
+## Connectivity of a `StructuredGrid`
+
+Using the method `connectivity(::SG{N,T,C}, ::Dim{D}, ::Side{S})`, one can obtain the connectivity underlying a structured grid. If no special grid topology is provided, a default `Bounded` grid topology is used for the `UniformGrid`. Therefore, on a default `UniformGrid`, the following assertions hold:
+
+```julia
+julia> @assert connectivity(grid, Dim(1), Side(1)) isa Bounded "Left boundary is bounded"
+julia> @assert connectivity(grid, Dim(1), Side(2)) isa Bounded "Right boundary is bounded"
+julia> @assert connectivity(grid, Dim(2), Side(1)) isa Bounded "Upper boundary is bounded"
+julia> @assert connectivity(grid, Dim(2), Side(2)) isa Bounded "Lower boundary is bounded"
+```
diff --git a/docs/src/concepts/kernels.md b/docs/src/concepts/kernels.md
new file mode 100644
index 00000000..8dd532df
--- /dev/null
+++ b/docs/src/concepts/kernels.md
@@ -0,0 +1,122 @@
+# Kernels
+
+The [KernelAbstactions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) package provides a macro-based dialect that hides the intricacies of vendor-specific GPU programming. It allows one to write hardware-agnostic kernels that can be instantiated and launched for different device backends without modifying the high-level code nor sacrificing performance.
+
+In the following, we show how to write and launch kernels on various backends. We also explain the concept of a `Launcher` in [Chmy.jl](https://github.com/PTsolvers/Chmy.jl), that complements the default kernel launching, allowing us to hide the latency between the bulk of the computations and boundary conditions or MPI communications.
+
+## Writing Kernels
+
+This section highlights some important features of [KernelAbstactions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) that are essential for understanding the high-level abstraction of the kernel concept that is used throughout our package. As it barely serves for illustrative purposes, for more specific examples, please refer to their [documentation](https://juliagpu.github.io/KernelAbstractions.jl/stable/).
+
+```julia
+using KernelAbstactions
+
+# Define a kernel that performs element-wise operations on A
+@kernel function mul2(A)
+    # use @index macro to obtain the global Cartesian index of the current work item.
+    I = @index(Global, Cartesian)
+    A[I] *= 2
+end
+```
+
+With the kernel `mul2` as defined using `@kernel` macro, we can launch it on the desired backend to perform the element-wise operations on host.
+
+```julia
+# Define array and work group size
+workgroup_size = 64
+A              = ones(1024, 1024)
+backend        = get_backend(A) # CPU
+
+# Launch kernel and explicitly synchronize
+mul2(backend, workgroup_size)(A, ndrange=size(A))
+synchronize(backend)
+
+# Result assertion
+@assert(all(A .== 2.0) == true)
+```
+
+To launch the kernel on GPU devices, one could simply define `A` as `CuArray`, `ROCArray` or `oneArray` as detailed in the section ["launching kernel on the backend"](https://juliagpu.github.io/KernelAbstractions.jl/stable/quickstart/#Launching-kernel-on-the-backend). More fine-grained memory access is available using the `@index` macro as described [here](https://juliagpu.github.io/KernelAbstractions.jl/stable/api/#KernelAbstractions.@index).
+
+### Thread Indexing
+
+Thread indexing is essential for memory usage on GPU devices; however, it can quickly become cumbersome to figure out the thread index, especially when working with multi-dimensional grids of multi-dimensional blocks of threads. The performance of kernels can also depend significantly on access patterns.
+
+In the example above, we saw the usage of `I = @index(Global, Cartesian)`, which retrieves the global index of threads for the two-dimensional array `A`. Such powerful macros are provided by [KernelAbstactions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) for conveniently retrieving the desired index of threads.
+
+The following table is non-exhaustive and provides a reference of commonly used terminology. Here, [`KernelAbstractions.@index`](https://juliagpu.github.io/KernelAbstractions.jl/stable/api/#KernelAbstractions.@index) is used for index retrieval, and [`KernelAbstractions.@groupsize`](https://juliagpu.github.io/KernelAbstractions.jl/stable/api/#KernelAbstractions.@groupsize) is used for obtaining the dimensions of blocks of threads.
+
+| KernelAbstractions                | CPU                     | CUDA                            |
+|-----------------------------------|-------------------------|---------------------------------|
+| `@index(Local, Linear)`           | `mod(i, g)`             | `threadIdx().x`                 |
+| `@index(Local, Cartesian)[2]`     |                         | `threadIdx().y`                 |
+| `@index(Group, Linear)`           | `i ÷ g`                 | `blockIdx().x`                  |
+| `@index(Group, Cartesian)[2]`     |                         | `blockIdx().y`                  |
+| `@groupsize()[3]`                 |                         | `blockDim().z`                  |
+| `prod(@groupsize())`              | `g`                     | `.x * .y * .z`                  |
+| `@index(Global, Linear)`          | `i`                     | global index computation needed |
+| `@index(Global, Cartesian)[2]`    |                         | global index computation needed |
+| `@index(Global, NTuple)`          |                         | `(threadIdx().x, ... )`         |
+
+
+The `@index(Global, NTuple)` returns a `NTuple` object, allowing more fine-grained memory control over the allocated arrays.
+
+```julia
+@kernel function memcpy!(a, b)
+    i, j = @index(Global, NTuple)
+    @inbounds a[i, j] = b[i, j]
+end
+```
+
+A tuple can be splatted with [`...`](https://docs.julialang.org/en/v1/manual/faq/#What-does-the-...-operator-do?) Julia operator when used to avoid explicitly using `i`, `j` indices.
+
+```julia
+@kernel function splatting_memcpy!(a, b)
+    I = @index(Global, NTuple)
+    @inbounds a[I...] = b[I...]
+end
+```
+
+## Kernel Launcher
+
+In [Chmy.jl](https://github.com/PTsolvers/Chmy.jl), the `KernelLaunch` module is designed to provide handy utilities for performing different grid operations on selected data entries of `Field`s that are involved at each kernel launch, in which the grid geometry underneath is also taken into account.
+
+Followingly, we define a kernel launcher associated with an `UniformGrid` object, supporting CUDA backend.
+
+```julia
+# Define backend and geometry
+arch   = Arch(CUDABackend())
+grid   = UniformGrid(arch; origin=(-1, -1), extent=(2, 2), dims=(126, 126))
+
+# Define launcher
+launch = Launcher(arch, grid)
+```
+
+We also have two kernel functions `compute_q!` and `update_C!` defined, which shall update the fields `q` and `C` using grid operators (see section [Grid Operators](./grid_operators.md)) `∂x`, `∂y`, `divg` that are anchored on some grid `g` accordingly.
+
+```julia
+@kernel inbounds = true function compute_q!(q, C, χ, g::StructuredGrid, O)
+    I = @index(Global, Cartesian)
+    I = I + O
+    q.x[I] = -χ * ∂x(C, g, I)
+    q.y[I] = -χ * ∂y(C, g, I)
+end
+
+@kernel inbounds = true function update_C!(C, q, Δt, g::StructuredGrid, O)
+    I = @index(Global, Cartesian)
+    I = I + O
+    C[I] -= Δt * divg(q, g, I)
+end
+```
+
+To spawn the kernel, we invoke the launcher using the `launch` function to perform the field update at each physical timestep, and specify desired boundary conditions for involved fields in the kernel.
+
+```julia
+# Define physics, numerics, geometry ...
+for it in 1:nt
+    # without boundary conditions
+    launch(arch, grid, compute_q! => (q, C, χ, grid))
+
+    # with Neumann boundary conditions and MPI exchange
+    launch(arch, grid, update_C! => (C, q, Δt, grid); bc=batch(grid, C => Neumann(); exchange=C))
+end
+```
diff --git a/docs/src/usage/runtests.md b/docs/src/developer_documentation/running_tests.md
similarity index 97%
rename from docs/src/usage/runtests.md
rename to docs/src/developer_documentation/running_tests.md
index fc029f53..b221ea54 100644
--- a/docs/src/usage/runtests.md
+++ b/docs/src/developer_documentation/running_tests.md
@@ -1,4 +1,4 @@
-# Running tests
+# Running Tests
 
 ## CPU tests
 
diff --git a/docs/src/developer_documentation/workers.md b/docs/src/developer_documentation/workers.md
new file mode 100644
index 00000000..940952c0
--- /dev/null
+++ b/docs/src/developer_documentation/workers.md
@@ -0,0 +1,9 @@
+# Workers
+
+Task-based parallelism provides a highly abstract view for the program execution scheduling, although it may come with a performance overhead related to task creation and destruction. The overhead is currently significant when tasks are used to perform asynchronous operations on GPUs, where TLS context creation and destruction may be in the order of kernel execution time. Therefore, it may be desirable to have long-running tasks that do not get terminated immediately, but only when all queued subtasks (i.e. work units) are executed.
+
+In [Chmy.jl](https://github.com/PTsolvers/Chmy.jl), we introduced the concept `Worker` for this purpose. A `Worker` is a special construct to extend the lifespan of a task created by `Threads.@spawn`. It possesses a `Channel` of subtasks to be executed on the current thread, where subtasks are submitted at construction time to the worker using `put!`.
+
+With the help of the `Worker`, we can specify subtasks that need to be sequentially executed by enqueuing them to one `Worker`. Any work units that shall run in parallel should be put into separate workers instead.
+
+Currently, we use Workers under the hood to hide the communications between computations. We split the computational domain into inner part, containing the bulk of the grid points, and thin outer part. We launch the same kernels processing the inner and outer parts in different Julia tasks. When the outer part completes, we launch the non-blocking MPI communication. Workers are a stateful representation of the long-running computation, needed to avoid significant overhead of creating a new task-local state each time a communication is performed.
diff --git a/docs/src/examples/overview.md b/docs/src/examples/overview.md
new file mode 100644
index 00000000..281baf27
--- /dev/null
+++ b/docs/src/examples/overview.md
@@ -0,0 +1,15 @@
+# Examples Overview
+
+This page provides an overview of [Chmy.jl](https://github.com/PTsolvers/Chmy.jl) examples. These selected examples demonstrate how [Chmy.jl](https://github.com/PTsolvers/Chmy.jl) can be used to solve various numerical problems using architecture-agnostic kernels both on a single-device and in a distributed way.
+
+## Table of Contents
+
+| Example    | Description | 
+|:------------|:------------|
+| [Diffusion 2D](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d.jl) | Solving the 2D diffusion equation on an uniform grid. |
+| [Diffusion 2D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_mpi.jl) | Solving the 2D diffusion equation on an uniform grid distributedly using MPI. |
+| [Single-Device Performance Optimization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d_perf.jl) | Revisiting the 2D diffusion problem with focus on performance optimization techniques on a single-device architecture | 
+| [Stokes 2D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/stokes_2d_inc_ve_T_mpi.jl) | Solving the 2D Stokes equation with thermal coupling on an uniform grid. | 
+| [Stokes 3D with MPI](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/stokes_3d_inc_ve_T_mpi.jl) | Solving the 3D Stokes equation with thermal coupling on an uniform grid distributedly using MPI. | 
+| [2D Grid Visualization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_2d.jl) | Visualization of a 2D `StructuredGrid`. | 
+| [3D Grid Visualization](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/grids_3d.jl) | Visualization of a 3D `StructuredGrid` | 
diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md
new file mode 100644
index 00000000..8ec5874d
--- /dev/null
+++ b/docs/src/getting_started.md
@@ -0,0 +1,147 @@
+# Getting Started with Chmy.jl
+
+[Chmy.jl](https://github.com/PTsolvers/Chmy.jl) is a backend-agnostic toolkit for finite difference computations on multi-dimensional computational staggered grids. In this introductory tutorial, we will showcase the essence of Chmy.jl by solving a simple 2D diffusion problem. The full code of the tutorial material is available under [diffusion_2d.jl](https://github.com/PTsolvers/Chmy.jl/blob/main/examples/diffusion_2d.jl).
+
+## Basic Diffusion
+
+The diffusion equation is a second order parabolic PDE, here for a multivariable function $C(x,y,t)$ that represents the field being diffused (such as the temperature or the concentration of a chemical component in a solution) showing derivatives in both temporal $\partial t$ and spatial $\partial x$ dimensions, where $\chi$ is the diffusion coefficient. In 2D we have the following formulation for the diffusion process:
+
+```math
+\begin{equation}
+\frac{\partial C}{\partial t} = \chi \left( \frac{\partial^2 C}{\partial x^2} + \frac{\partial^2 C}{\partial y^2} \right).
+\end{equation}
+```
+
+Introducing the diffusion flux $q$, we can rewrite equation `(1)` as a system of two PDEs, consisting of equations `(2)` and `(3)`.
+
+```math
+\begin{equation}
+\boldsymbol{q} = -\chi \nabla C~,
+\end{equation}
+```
+
+```math
+\begin{equation}
+\frac{\partial C}{\partial t} = - \nabla \cdot \boldsymbol{q}~.
+\end{equation}
+```
+
+### Boundary Conditions
+
+Generally, partial differential equations (PDEs) require initial or [boundary conditions](./concepts/bc.md) to ensure a unique and stable solution. For the field `C`, a Neumann boundary condition is given by:
+
+```math
+\begin{equation}
+\frac{\partial C}{\partial \boldsymbol{n}} = g(x, y, t)
+\end{equation}
+```
+where $\frac{\partial C}{\partial \boldsymbol{n}}$ is the derivative of `C` normal to the boundary, and $g(x, y, t)$ is a given function. In this tutorial example, we consider a homogeneous Neumann boundary condition, $g(x, y, t) = 0$, which implies that there is no flux across the boundary.
+
+## Using Chmy.jl for Backend Portable Implementation
+
+As the first step, we need to load the main module and any necessary submodules of [Chmy.jl](https://github.com/PTsolvers/Chmy.jl). Moreover, we use [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) for writing backend-agnostic kernels that are compatible with Chmy.jl.
+
+```julia
+using Chmy, Chmy.Architectures, Chmy.Grids, Chmy.Fields, Chmy.BoundaryConditions, Chmy.GridOperators, Chmy.KernelLaunch
+using KernelAbstractions # for backend-agnostic kernels
+using Printf, CairoMakie # for I/O and plotting
+# using CUDA
+# using AMDGPU
+```
+
+In this introductory tutorial, we will use the CPU backend for simplicity:
+
+```julia
+arch = Arch(CPU())
+```
+
+If a different backend is desired, one needs to load the relevant package accordingly. For example, if Nvidia or AMD GPUs are available, one can comment out `using CUDA` or `using AMDGPU` and make sure to use `arch = Arch(CUDABackend())` or `arch = Arch(ROCBackend())`, respectively, when selecting the architecture. For further information about executing on a single-device or multi-device architecture, see the documentation section for [Architectures](./concepts/architectures.md)
+
+## Writing & Launch Compute Kernels
+
+We want to solve the system of equations `(2)` & `(3)` numerically. We will use the explicit forward [Euler method](https://en.wikipedia.org/wiki/Euler_method) for temporal discretization and [finite-differences](https://en.wikipedia.org/wiki/Finite_difference) for spatial discretization. Accordingly, the kernels for performing the arithmetic operations for each time step can be defined as follows:
+
+```julia
+@kernel inbounds = true function compute_q!(q, C, χ, g::StructuredGrid, O)
+    I = @index(Global, Cartesian)
+    I = I + O
+    q.x[I] = -χ * ∂x(C, g, I)
+    q.y[I] = -χ * ∂y(C, g, I)
+end
+```
+
+```julia
+@kernel inbounds = true function update_C!(C, q, Δt, g::StructuredGrid, O)
+    I = @index(Global, Cartesian)
+    I = I + O
+    C[I] -= Δt * divg(q, g, I)
+end
+```
+
+!!! note "Non-Cartesian indices"
+    You can use not only `Cartesian` indices, but more usual "subscript" indices as well. For example, `update_C!` will become:
+    ```julia
+    @kernel inbounds = true function update_C!(C, q, Δt, g::StructuredGrid, O)
+        ix, iy = @index(Global, NTuple)
+        (ix, iy) = (ix, iy) + O
+        C[ix, iy] -= Δt * divg(q, g, ix, iy)
+    end
+    ```
+
+## Model Setup
+
+The diffusion model that we solve should contain the following model setup
+
+```julia
+# geometry
+grid   = UniformGrid(arch; origin=(-1, -1), extent=(2, 2), dims=(126, 126))
+launch = Launcher(arch, grid)
+# physics
+χ = 1.0
+# numerics
+Δt = minimum(spacing(grid))^2 / χ / ndims(grid) / 2.1
+```
+
+In the 2D problem only three physical fields, the field `C` and the diffusion flux `q` in `x`- and `y`-dimension are evolving with time. We define these fields on different locations on the staggered grid (more see [Grids](./concepts/grids.md)).
+
+```julia
+# allocate fields
+C = Field(backend, grid, Center())
+q = VectorField(backend, grid)
+```
+
+We randomly initialized the entries of `C` field and finished the initial model setup. One can refer to the section [Fields](./concepts/fields.md) for setting up more complex initial conditions.
+
+```julia
+# initial conditions
+set!(C, grid, (_, _) -> rand())
+bc!(arch, grid, C => Neumann(); exchange=C)
+```
+
+```@raw html
+<div style="text-align: center;">
+    <img src="assets/field_set_ic_random.png" width="50%"/>
+</div>
+```
+
+## Solving Time-dependent Problem
+
+We are resolving a time-dependent problem, so we explicitly advance our solution within a time loop, specifying the number of iterations (or time steps) we desire to perform. The action that takes place within the time loop is the variable update that is performed by the compute kernels `compute_q!` and `update_C!`, accompanied by the imposing the Neumann boundary condition on the `C` field.
+
+```julia
+# action
+nt = 100
+for it in 1:nt
+    @printf("it = %d/%d \n", it, nt)
+    launch(arch, grid, compute_q! => (q, C, χ, grid))
+    launch(arch, grid, update_C! => (C, q, Δt, grid); bc=batch(grid, C => Neumann(); exchange=C))
+end
+```
+
+After running the simulation, you should see something like this, here the final result at `it = 100` for the field `C` is plotted.
+
+```@raw html
+<div style="text-align: center;">
+    <img src="assets/diffusion_2d_it_100.png" width="50%"/>
+</div>
+```
diff --git a/docs/src/index.md b/docs/src/index.md
index d5fa24c3..96272dc0 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,10 +1,41 @@
 # Chmy.jl
 
-Documentation for [Chmy.jl](https://github.com/PTsolvers/Chmy.jl).
+[Chmy.jl](https://github.com/PTsolvers/Chmy.jl) is a backend-agnostic toolkit for finite difference computations on multi-dimensional computational staggered grids. Chmy.jl features task-based distributed memory parallelisation capabilities.
 
-Information about the GPU4GEO project can be found on the [GPU4GEO website](https://ptsolvers.github.io/GPU4GEO/).
+## Installation
 
-## Doc content
+To install Chmy.jl, one can simply add it using the Julia package manager:
 
-- [Usage information](usage/runtests.md)
-- [API reference](lib/modules.md)
+```julia
+julia> using Pkg
+julia> Pkg.add("Chmy")
+```
+
+After the package is installed, one can load the package by using:
+
+```julia
+using Chmy
+```
+
+!!! info "Install from a Specific Branch" 
+    For developers and advanced users, one might want to use the implementation of Chmy.jl from a specific branch by specifying the url. In the following code snippet, we do this by explicitly specifying to use the current implementation that is available under the `main` branch:
+
+    ```julia
+    using Pkg; Pkg.add(url="https://github.com/PTsolvers/Chmy.jl#main")
+    ```
+
+## Feature Summary
+
+Chmy.jl provides a comprehensive framework for handling complex computational tasks on structured grids, leveraging both single and multi-device architectures. It seamlessly integrates with Julia's powerful parallel and concurrent programming capabilities, making it suitable for a wide range of scientific and engineering applications.
+
+A general list of the features is:
+
+- Distributed computing support with MPi.jl
+- Multi-dimensional, parameterizable discrete and continuous fields on structured grids
+- High-level interface for specifying boundary conditions with automatic batching for performance
+- Finite difference and interpolation operators on discrete fields
+- Extensibility. The whole package is written in pure Julia, so adding new functions, simplification rules, and model transformations has no barrier.
+
+## Funding
+
+The development of this package is supported by the [GPU4GEO PASC project](https://pasc-ch.org/projects/2021-2024/gpu4geo/index.html). More information about the GPU4GEO project can be found on the [GPU4GEO website](https://ptsolvers.github.io/GPU4GEO/).
diff --git a/src/Architectures.jl b/src/Architectures.jl
index 2765ec75..1a76a34b 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -65,7 +65,8 @@ get_device(arch::SingleDeviceArchitecture) = arch.device
 """
     activate!(arch::SingleDeviceArchitecture; priority=:normal)
 
-Activate the given architecture on the specified device and set the priority of the backend.
+Activate the given architecture on the specified device and set the priority of the 
+backend. For the priority accepted values are `:normal`, `:low` and `:high`.
 """
 function activate!(arch::SingleDeviceArchitecture; priority=:normal)
     set_device!(arch.device)
diff --git a/src/BoundaryConditions/boundary_function.jl b/src/BoundaryConditions/boundary_function.jl
index 18899d45..7c492703 100644
--- a/src/BoundaryConditions/boundary_function.jl
+++ b/src/BoundaryConditions/boundary_function.jl
@@ -1,3 +1,8 @@
+"""
+    abstract type BoundaryFunction{F}
+    
+Abstract type for boundary condition functions with function type `F`.
+"""
 abstract type BoundaryFunction{F} end
 
 struct ReducedDimensions end
diff --git a/src/BoundaryConditions/first_order_boundary_condition.jl b/src/BoundaryConditions/first_order_boundary_condition.jl
index 19bb54aa..9132616e 100644
--- a/src/BoundaryConditions/first_order_boundary_condition.jl
+++ b/src/BoundaryConditions/first_order_boundary_condition.jl
@@ -1,15 +1,31 @@
 struct DirichletKind end
 struct NeumannKind end
 
+"""
+    struct FirstOrderBC{T,Kind} <: FieldBoundaryCondition
+
+A struct representing a boundary condition of first-order accuracy.
+"""
 struct FirstOrderBC{T,Kind} <: FieldBoundaryCondition
     value::T
     FirstOrderBC{Kind}(value::T) where {Kind,T} = new{T,Kind}(value)
 end
 
 const Dirichlet{T} = FirstOrderBC{T,DirichletKind}
-const Neumann{T}   = FirstOrderBC{T,NeumannKind}
+const Neumann{T} = FirstOrderBC{T,NeumannKind}
+
+"""
+    Dirichlet(value=nothing)
 
+Create a `Dirichlet` object representing the Dirichlet boundary condition with the specified value.
+"""
 Dirichlet(value=nothing) = FirstOrderBC{DirichletKind}(value)
+
+"""
+    Neumann(value=nothing)
+
+Create a `Neumann` object representing the Neumann boundary condition with the specified value.
+"""
 Neumann(value=nothing)   = FirstOrderBC{NeumannKind}(value)
 
 Base.show(io::IO, ::Dirichlet{Nothing}) = print(io, "Dirichlet(0)")
diff --git a/src/Distributed/distributed_architecture.jl b/src/Distributed/distributed_architecture.jl
index 9024fb39..de7a5356 100644
--- a/src/Distributed/distributed_architecture.jl
+++ b/src/Distributed/distributed_architecture.jl
@@ -30,9 +30,33 @@ function Architectures.Arch(backend::Backend, comm::MPI.Comm, dims; device_id=no
     return DistributedArchitecture(child_arch, topology)
 end
 
+"""
+    topology(arch::DistributedArchitecture)
+
+Get the virtual MPI topology of a distributed architecture
+"""
 topology(arch::DistributedArchitecture) = arch.topology
 
 # Implement Architecture API
+"""
+    get_backend(arch::DistributedArchitecture)
+
+Get the backend associated with a DistributedArchitecture by delegating to the child architecture.
+"""
 Architectures.get_backend(arch::DistributedArchitecture) = Architectures.get_backend(arch.child_arch)
+
+"""
+    get_device(arch::DistributedArchitecture)
+
+Get the device associated with a DistributedArchitecture by delegating to the child architecture.
+"""
 Architectures.get_device(arch::DistributedArchitecture) = get_device(arch.child_arch)
+
+"""
+    activate!(arch::DistributedArchitecture; kwargs...)
+
+Activate the given DistributedArchitecture by delegating to the child architecture, 
+and pass through any keyword arguments. For example, the priority can be set with 
+accepted values being `:normal`, `:low`, and `:high`.
+"""
 Architectures.activate!(arch::DistributedArchitecture; kwargs...) = activate!(arch.child_arch; kwargs...)
diff --git a/src/Distributed/exchange_halo.jl b/src/Distributed/exchange_halo.jl
index 73da1185..bca7a4d5 100644
--- a/src/Distributed/exchange_halo.jl
+++ b/src/Distributed/exchange_halo.jl
@@ -84,11 +84,26 @@ function exchange_halo!(arch::DistributedArchitecture, grid::StructuredGrid{N},
     return
 end
 
-# Adapter for BoundaryConditions
-function BoundaryConditions.bc!(side::Side, dim::Dim,
+"""
+    BoundaryConditions.bc!(side::Side, dim::Dim,
+                                arch::DistributedArchitecture,
+                                grid::StructuredGrid,
+                                batch::ExchangeBatch; kwargs...)
+
+Apply boundary conditions on a distrbuted grid with halo exchange performed internally.
+
+# Arguments
+- `side`: The side of the grid where the halo exchange is performed.
+- `dim`: The dimension along which the halo exchange is performed.
+- `arch`: The distributed architecture used for communication.
+- `grid`: The structured grid on which the halo exchange is performed.
+- `batch`: The batch set to apply boundary conditions to.
+"""
+function BoundaryConditions.bc!(side::Side, 
+                                dim::Dim,
                                 arch::DistributedArchitecture,
                                 grid::StructuredGrid,
                                 batch::ExchangeBatch; kwargs...)
     exchange_halo!(side, dim, arch, grid, batch.fields...; kwargs...)
     return
-end
+end
\ No newline at end of file
diff --git a/src/Fields/abstract_field.jl b/src/Fields/abstract_field.jl
index 6c637217..0369c0a3 100644
--- a/src/Fields/abstract_field.jl
+++ b/src/Fields/abstract_field.jl
@@ -1,3 +1,10 @@
+"""
+    abstract type AbstractField{T,N,L} <: AbstractArray{T,N}
+    
+Abstract type representing a field with data type `T`, number of dimensions `N`, location `L` where the field should be defined on.
+
+See also: abstract type [`ConstantField`](@ref)
+"""
 abstract type AbstractField{T,N,L} <: AbstractArray{T,N} end
 
 Base.@assume_effects :foldable halo(::AbstractArray) = 0
diff --git a/src/Fields/field.jl b/src/Fields/field.jl
index ef72d247..25d5fc80 100644
--- a/src/Fields/field.jl
+++ b/src/Fields/field.jl
@@ -23,6 +23,13 @@ end
 
 _expand(rng::AbstractUnitRange, h) = (first(rng)-h):(last(rng)+h)
 
+"""
+    interior(f::Field; with_halo=false)
+
+Displays the field on the interior of the grid on which it is defined on.
+One could optionally specify to display the halo regions on the grid with
+`with_halo=true`.
+"""
 function interior(f::Field; with_halo=false)
     ax      = with_halo ? _expand.(axes(f), halo(f)) : axes(f)
     indices = broadcast(.+, 2 .* halo(f), ax)
@@ -39,7 +46,7 @@ Adapt.adapt_structure(to, f::Field{T,N,L,H}) where {T,N,L,H} = Field{L,H}(Adapt.
 
 Constructs a field on a structured grid at the specified location.
 
-Arguments:
+## Arguments:
 - `backend`: The backend to use for memory allocation.
 - `grid`: The structured grid on which the field is constructed.
 - `loc`: The location or locations on the grid where the field is constructed.
@@ -54,13 +61,51 @@ function Field(backend::Backend, grid::StructuredGrid{N}, loc::LocOrLocs{N}, typ
     return Field{typeof(loc),halo}(data, dims)
 end
 
+"""
+    Field(arch::Architecture, args...; kwargs...)
+
+Create a `Field` object on the specified architecture.
+
+## Arguments:
+- `arch::Architecture`: The architecture for which to create the `Field`.
+- `args...`: Additional positional arguments to pass to the `Field` constructor.
+- `kwargs...`: Additional keyword arguments to pass to the `Field` constructor.
+"""
 Field(arch::Architecture, args...; kwargs...) = Field(Architectures.get_backend(arch), args...; kwargs...)
 
 # set fields
 
+"""
+    set!(f::Field, val::Number)
+
+Set all elements of the `Field` `f` to the specified numeric value `val`.
+
+## Arguments:
+- `f::Field`: The `Field` object to be modified.
+- `val::Number`: The numeric value to set in the `Field`.
+"""
 set!(f::Field, val::Number) = (fill!(interior(f), val); nothing)
+
+"""
+    set!(f::Field, A::AbstractArray)
+
+Set the elements of the `Field` `f` using the values from the `AbstractArray` `A`.
+
+## Arguments:
+- `f::Field`: The `Field` object to be modified.
+- `A::AbstractArray`: The array whose values are to be copied to the `Field`.
+"""
 set!(f::Field, A::AbstractArray) = (copyto!(interior(f), A); nothing)
 
+"""
+    set!(f::Field, other::AbstractField)
+
+Set the elements of the `Field` `f` using the values from another `AbstractField` `other`.
+
+## Arguments:
+- `f::Field`: The destination `Field` object to be modified.
+- `other::AbstractField`: The source `AbstractField` whose values are to be copied to `f`.
+"""
 function set!(f::Field, other::AbstractField)
     dst = interior(f)
     src = interior(other)
@@ -102,6 +147,17 @@ set!(fs::NamedTuple{names,<:NTuple{N,Field}}, args...) where {names,N} = foreach
 # vector fields
 vector_location(::Val{dim}, ::Val{N}) where {dim,N} = ntuple(i -> i == dim ? Vertex() : Center(), Val(N))
 
+"""
+    VectorField(backend::Backend, grid::StructuredGrid{N}, args...; kwargs...) where {N}
+
+Create a vector field in the form of a `NamedTuple` on the given `grid` using the specified `backend`. With each component being a `Field`.
+
+## Arguments:
+- `backend::Backend`: The backend to be used for computation.
+- `grid::StructuredGrid{N}`: The structured grid defining the computational domain.
+- `args...`: Additional positional arguments to pass to the `Field` constructor.
+- `kwargs...`: Additional keyword arguments to pass to the `Field` constructor.
+"""
 function VectorField(backend::Backend, grid::StructuredGrid{N}, args...; kwargs...) where {N}
     coord_names = axes_names(grid)
     names = ntuple(i -> coord_names[i], Val(N))
@@ -112,13 +168,34 @@ function VectorField(backend::Backend, grid::StructuredGrid{N}, args...; kwargs.
     return NamedTuple{names}(values)
 end
 
-# tensor fields
+"""
+    TensorField(backend::Backend, grid::StructuredGrid{2}, args...; kwargs...)
+
+Create a 2D tensor field in the form of a named tuple on the given `grid` using the specified `backend`, with components `xx`, `yy`, and `xy` each being a `Field`.
+
+## Arguments:
+- `backend::Backend`: The backend to be used for computation.
+- `grid::StructuredGrid{2}`: The 2D structured grid defining the computational domain.
+- `args...`: Additional positional arguments to pass to the `Field` constructor.
+- `kwargs...`: Additional keyword arguments to pass to the `Field` constructor.
+"""
 function TensorField(backend::Backend, grid::StructuredGrid{2}, args...; kwargs...)
     (xx = Field(backend, grid, Center(), args...; kwargs...),
      yy = Field(backend, grid, Center(), args...; kwargs...),
      xy = Field(backend, grid, Vertex(), args...; kwargs...))
 end
 
+"""
+    TensorField(backend::Backend, grid::StructuredGrid{3}, args...; kwargs...)
+
+Create a 3D tensor field in the form of a named tuple on the given `grid` using the specified `backend`, with components `xx`, `yy`, `zz`, `xy`, `xz`, and `yz` each being a `Field`.
+
+## Arguments:
+- `backend::Backend`: The backend to be used for computation.
+- `grid::StructuredGrid{3}`: The 3D structured grid defining the computational domain.
+- `args...`: Additional positional arguments to pass to the `Field` constructor.
+- `kwargs...`: Additional keyword arguments to pass to the `Field` constructor.
+"""
 function TensorField(backend::Backend, grid::StructuredGrid{3}, args...; kwargs...)
     (xx = Field(backend, grid, Center(), args...; kwargs...),
      yy = Field(backend, grid, Center(), args...; kwargs...),
diff --git a/src/Fields/function_field.jl b/src/Fields/function_field.jl
index fae2e5be..f50fc3d6 100644
--- a/src/Fields/function_field.jl
+++ b/src/Fields/function_field.jl
@@ -20,6 +20,18 @@ struct FunctionField{T,N,L,CD,F,G,P} <: AbstractField{T,N,L}
     end
 end
 
+"""
+    FunctionField(func::F, grid::StructuredGrid{N}, loc; discrete=false, parameters=nothing) where {F,N}
+
+Create a `FunctionField` on the given `grid` using the specified function `func`.
+
+## Arguments:
+- `func::F`: The function used to generate the field values.
+- `grid::StructuredGrid{N}`: The structured grid defining the computational domain.
+- `loc`: The nodal location on the grid grid where the function field is defined on.
+- `discrete::Bool=false`: A flag indicating whether the field should be discrete. Defaults to `false`.
+- `parameters=nothing`: Additional parameters to be used by the function. Defaults to `nothing`.
+"""
 function FunctionField(func::F, grid::StructuredGrid{N}, loc; discrete=false, parameters=nothing) where {F,N}
     loc = expand_loc(Val(N), loc)
     L   = typeof(loc)
diff --git a/src/GridOperators/interpolation.jl b/src/GridOperators/interpolation.jl
index 3f2d49e7..5fd81aed 100644
--- a/src/GridOperators/interpolation.jl
+++ b/src/GridOperators/interpolation.jl
@@ -1,5 +1,11 @@
 # 1D interpolation rules
 
+"""
+    abstract type InterpolationRule
+
+A type representing an interpolation rule that specifies how
+the interpolant `f` should be reconstructed using a data set on a given grid.
+"""
 abstract type InterpolationRule end
 
 struct Linear <: InterpolationRule end
diff --git a/src/GridOperators/masked_operators.jl b/src/GridOperators/masked_operators.jl
index 20f158aa..95cd0fc6 100644
--- a/src/GridOperators/masked_operators.jl
+++ b/src/GridOperators/masked_operators.jl
@@ -1,3 +1,9 @@
+"""
+    abstract type AbstractMask{T,N}
+
+Abstract type representing the data transformation to be performed on elements
+in a field of dimension `N`, where each element is of type`T`.
+"""
 abstract type AbstractMask{T,N} end
 
 @propagate_inbounds at(ω::AbstractMask{T,N}, loc::NTuple{N,Location}, I::CartesianIndex{N}) where {T,N} = at(ω, loc, Tuple(I)...)
diff --git a/src/Grids/Grids.jl b/src/Grids/Grids.jl
index 4c577ce6..dd52af81 100644
--- a/src/Grids/Grids.jl
+++ b/src/Grids/Grids.jl
@@ -18,9 +18,25 @@ import Chmy: @add_cartesian
 
 import Base.@propagate_inbounds
 
+"""
+    abstract type Location
+
+Abstract type representing a location in a grid cell.
+"""
 abstract type Location end
 
+"""
+    struct Center <: Location
+
+The `Center` struct represents a location at the center along a dimension of a grid cell.
+"""
 struct Center <: Location end
+
+"""
+    struct Vertex <: Location
+
+The `Vertex` struct represents a location at the vertex along a dimension of a grid cell.
+"""
 struct Vertex <: Location end
 
 Base.@assume_effects :total flip(::Center) = Vertex()
@@ -28,6 +44,11 @@ Base.@assume_effects :total flip(::Vertex) = Center()
 
 Base.broadcastable(o::Location) = Ref(o)
 
+"""
+    abstract type Connectivity
+
+Abstract type representing the connectivity of grid elements.
+"""
 abstract type Connectivity end
 
 struct Bounded <: Connectivity end
@@ -43,7 +64,18 @@ include("uniform_axis.jl")
 include("function_axis.jl")
 include("structured_grid.jl")
 
+"""
+    Δ
+
+Alias for the `spacing` method that returns the spacing between grid points.
+"""
 const Δ  = spacing
+
+"""
+    iΔ
+
+Alias for the `inv_spacing` method that returns the reciprocal of the spacing between grid points.
+"""
 const iΔ = inv_spacing
 
 end # module Grids
diff --git a/src/Grids/abstract_axis.jl b/src/Grids/abstract_axis.jl
index 0fdf2878..d3d84df3 100644
--- a/src/Grids/abstract_axis.jl
+++ b/src/Grids/abstract_axis.jl
@@ -1,3 +1,8 @@
+"""
+    abstract type AbstractAxis{T}
+    
+Abstract type representing an axis in a grid, where the axis is parameterized by the type `T` of the coordinates.
+"""
 abstract type AbstractAxis{T} end
 
 Base.eltype(::AbstractAxis{T}) where {T} = T
diff --git a/src/Grids/structured_grid.jl b/src/Grids/structured_grid.jl
index 7dd05635..da70765b 100644
--- a/src/Grids/structured_grid.jl
+++ b/src/Grids/structured_grid.jl
@@ -223,8 +223,25 @@ direction(::SG, ::Val{:x}) = Dim(1)
 direction(::SG, ::Val{:y}) = Dim(2)
 direction(::SG, ::Val{:z}) = Dim(3)
 
+"""
+    axes_names(::SG{1})
+
+Returns the names of the axes for a 1-dimensional structured grid.
+"""
 axes_names(::SG{1}) = (:x,)
+
+"""
+    axes_names(::SG{2})
+
+Returns the names of the axes for a 2-dimensional structured grid.
+"""
 axes_names(::SG{2}) = (:x, :y)
+
+"""
+    axes_names(::SG{3})
+
+Returns the names of the axes for a 3-dimensional structured grid.
+"""
 axes_names(::SG{3}) = (:x, :y, :z)
 
 # cell volumes
diff --git a/src/KernelLaunch.jl b/src/KernelLaunch.jl
index f5720cd6..3e456fd8 100644
--- a/src/KernelLaunch.jl
+++ b/src/KernelLaunch.jl
@@ -11,12 +11,30 @@ using Chmy.Workers
 
 using KernelAbstractions
 
+"""
+    struct Launcher{Worksize,OuterWidth,Workers}
+
+A struct representing a launcher for asynchronous kernel execution.
+"""
 struct Launcher{Worksize,OuterWidth,Workers}
     workers::Workers
 end
 
-# worksize for the last dimension N takes into account only last outer width W[N], N-1 uses W[N] and W[N-1], N-2 uses W[N], W[N-1], and W[N-2]
+"""
+    Launcher(arch, grid; outer_width=nothing)
+
+Constructs a `Launcher` object configured based on the input parameters.
+
+## Arguments:
+- `arch`: The associated architecture.
+- `grid`: The grid defining the computational domain.
+- `outer_width`: Optional parameter specifying outer width.
+
+!!! warning
 
+    worksize for the last dimension N takes into account only last outer width 
+    W[N], N-1 uses W[N] and W[N-1], N-2 uses W[N], W[N-1], and W[N-2].
+"""
 function Launcher(arch, grid; outer_width=nothing)
     worksize = size(grid, Center()) .+ 2
 
@@ -66,6 +84,23 @@ Base.@assume_effects :foldable function outer_offset(launcher::Launcher, ::Dim{D
     end
 end
 
+"""
+    (launcher::Launcher)(arch::Architecture, grid, kernel_and_args::Pair{F,Args}; bc=nothing, async=false) where {F,Args}
+
+Launches a computational kernel using the specified `arch`, `grid`, `kernel_and_args`, and optional boundary conditions (`bc`).
+
+## Arguments:
+- `arch::Architecture`: The architecture on which to execute the computation.
+- `grid`: The grid defining the computational domain.
+- `kernel_and_args::Pair{F,Args}`: A pair consisting of the computational kernel `F` and its arguments `Args`.
+- `bc=nothing`: Optional boundary conditions for the computation.
+- `async=false`: If `true`, launches the kernel asynchronously.
+
+!!! warning
+    - `arch` should be compatible with the `Launcher`'s architecture.
+    - If `bc` is `nothing`, the kernel is launched without boundary conditions.
+    - If `async` is `false` (default), the function waits for the computation to complete before returning.
+"""
 function (launcher::Launcher)(arch::Architecture, grid, kernel_and_args::Pair{F,Args}; bc=nothing, async=false) where {F,Args}
     kernel, args = kernel_and_args