updated the Fibonacci device example to be roughly correct :)

TESSEorg · Mar 20, 2024 · 4d525ef · 4d525ef
1 parent 42ef7d8
commit 4d525ef
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 51 deletions.
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
@@ -16,6 +16,12 @@ list(APPEND ut_src fibonacci-coro.cc)
 list(APPEND ut_src device_coro.cc)
 if (TTG_HAVE_CUDA)
     list(APPEND ut_src cuda_kernel.cu)
+    # Fibonacci device example; the kernel lives in a .cu file so that it is compiled as CUDA
+    list(APPEND ut_src
+            fibonacci_device.cc
+            fibonacci_cuda_kernel.h
+            fibonacci_cuda_kernel.cu
+    )
 endif(TTG_HAVE_CUDA)
 list(APPEND ut_libs std::coroutine)
 
@@ -25,11 +31,10 @@ add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" CO
 add_ttg_executable(serialization serialization.cc unit_main.cpp
                    LINK_LIBRARIES Catch2::Catch2 ttg-serialization  $<TARGET_NAME_IF_EXISTS:BTAS::BTAS>
                    COMPILE_DEFINITIONS $<$<TARGET_EXISTS:BTAS::BTAS>:TTG_HAS_BTAS=1>)
-#target_link_libraries(serialization "Catch2::Catch2;ttg-serialization")
-#if (TARGET BTAS::BTAS)
-#    target_link_libraries(serialization BTAS::BTAS)
-#    target_compile_definitions(serialization PRIVATE TTG_HAS_BTAS=1)
-#endif (TARGET BTAS::BTAS)
+
+# Boost serialization test: checks low-level codegen
+add_ttg_executable(serialization_boost serialization_boost.cc
+        LINK_LIBRARIES ttg-serialization-boost RUNTIMES "parsec")
 
 # TODO: convert into unit test
 #if (TARGET MADworld)

diff --git a/tests/unit/fibonacci_cuda_kernel.cu b/tests/unit/fibonacci_cuda_kernel.cu
@@ -1,27 +1,15 @@
-#include "cuda_kernel.h"
+#include "fibonacci_cuda_kernel.h"
 
 #ifdef TTG_HAVE_CUDA
 
-__global__ void cu_calculate_fibonacci(int64_t* results, std::size_t n) {
-  int tx = threadIdx.x; // Thread index
-
-  if (tx == 0) {
-    int64_t a = 0, b = 1, c;
-    if (n == 0) {
-      results[tx] = a;
-      return;
-    }
-    for (int i = 2; i <= n; i++) {
-      c = a + b;
-      a = b;
-      b = c;
-    }
-    results[tx] = b;
-  }
+__global__ void cu_next_value(int64_t* fn_and_fnm1) {  // in: {F_n, F_{n-1}}; out: {F_{n+1}, F_n}
+  const int64_t fn = fn_and_fnm1[0];     // remember F_n before slot 0 is overwritten
+  fn_and_fnm1[0] = fn + fn_and_fnm1[1];  // slot 0 <- F_{n+1} = F_n + F_{n-1}
+  fn_and_fnm1[1] = fn;                   // slot 1 <- F_n
 }
 
-void calculate_fibonacci(int64_t* results, std::size_t n) {
-  cu_calculate_fibonacci<<<1, 1>>>(results, n); // Adjust <<<1, 1>>> as needed for parallel computation
+void next_value(int64_t* fn_and_fnm1) {  // host-side entry point: advances the pair by one Fibonacci step
+  cu_next_value<<<dim3(1), dim3(1)>>>(fn_and_fnm1);  // one block of one thread; the kernel does a single serial update
 }
 
 #endif // TTG_HAVE_CUDA
diff --git a/tests/unit/fibonacci_cuda_kernel.h b/tests/unit/fibonacci_cuda_kernel.h
@@ -1,4 +1,4 @@
 #include "ttg/config.h"
 #include <cinttypes>
 
-void calculate_fibonacci(int64_t* result, std::size_t n);
+void next_value(int64_t* fn_and_fnm1);  // launches a CUDA kernel that updates {F_n, F_{n-1}} in place to {F_{n+1}, F_n}
diff --git a/tests/unit/fibonacci_device.cc b/tests/unit/fibonacci_device.cc
@@ -4,23 +4,23 @@
 // Define TTG_USE_CUDA only if CUDA support is desired and available
 #ifdef TTG_USE_CUDA
 #include "cuda_runtime.h"
-#include "cuda_kernel.h"
+#include "fibonacci_cuda_kernel.h"
 #endif
 
 #include "ttg/serialization.h"
 
 // Default to CUDA if available, can be overridden by defining TTG_USE_XXX for other backends
 #define ES ttg::default_execution_space()
 
-struct A : public ttg::TTValue<A> {
-  int64_t value;
-  ttg::Buffer<int64_t> buffer;
+/// N.B. contains values of F_n and F_{n-1}
+struct Fn : public ttg::TTValue<Fn> {
+  int64_t F[2] = {1, 0};  // F[0] = F_n, F[1] = F_{n-1}
+  ttg::Buffer<int64_t> b;
 
-  A() : value(0), buffer(&value, 1) {}
-  A(int64_t val) : value(val), buffer(&value, 1) {}
+  Fn() : b(&F[0], 2) {}
 
-  A(A&& other) = default;
-  A& operator=(A&& other) = default;
+  Fn(Fn&& other) = default;
+  Fn& operator=(Fn&& other) = default;
 
   template <typename Archive>
   void serialize(Archive& ar) {
@@ -34,39 +34,41 @@ struct A : public ttg::TTValue<A> {
 
 int main(int argc, char* argv[]) {
   ttg::initialize(argc, argv, -1);
-  const int64_t N = 20;
+  const int64_t F_n_max = 1000;
 
-  ttg::Edge<int64_t, A> f2f;
-  ttg::Edge<void, A> f2p;
+  ttg::Edge<int64_t, Fn> f2f;
+  ttg::Edge<void, Fn> f2p;
 
   auto fib = ttg::make_tt<ES>(
-      [=](int64_t n, A& F_nms) -> ttg::device::Task {
-        if (n <= N) {
-          co_await ttg::device::select(F_nms.buffer);
-
-          int64_t result = calculate_fibonacci(n);
-
-          A F_n(result);
-          if (n < N) {
-            co_await ttg::device::send<0>(n + 1, F_n);
-          } else {
-            co_await ttg::device::sendv<1>(F_n);
-          }
+      [=](int64_t n, Fn& f_n) -> ttg::device::Task {
+        assert(n > 0);
+
+        co_await ttg::device::select(f_n.b);
+
+        next_value(f_n.b.current_device_ptr());
+
+        // wait for the task to complete and the values to be brought back to the host
+        co_await ttg::device::wait(f_n.b);
+
+        if (f_n.F[0] < F_n_max) {
+          co_await ttg::device::forward(ttg::device::send<0>(n + 1, f_n));
+        } else {
+          co_await ttg::device::forward(ttg::device::sendv<1>(f_n));
         }
       },
       ttg::edges(f2f),
       ttg::edges(f2f, f2p),
       "fib");
 
-  auto print = ttg::make_tt([](A F_N) {
-    std::cout << "The " << N << "th Fibonacci number is " << F_N.value << std::endl;
+  auto print = ttg::make_tt([](Fn f_n) {  // terminal task: f_n.F[1] is F_{n-1}, the last value that was still below F_n_max
+    std::cout << "The largest Fibonacci number smaller than " << F_n_max << " is " << f_n.F[1] << std::endl;
   },
                             ttg::edges(f2p),
                             ttg::edges(),
                             "print");
 
   ttg::make_graph_executable(fib.get());
-  if (ttg::default_execution_context().rank() == 0) fib->invoke(2, A(1));
+  if (ttg::default_execution_context().rank() == 0) fib->invoke(1, Fn{});
 
   ttg::execute(ttg_default_execution_context());
   ttg::fence(ttg_default_execution_context());