HigherOrderCO · enricozb · Aug 24, 2024 · Aug 25, 2024 · Aug 25, 2024 · Aug 25, 2024
diff --git a/build.rs b/build.rs
@@ -2,14 +2,13 @@ fn main() {
   let cores = num_cpus::get();
   let tpcl2 = (cores as f64).log2().floor() as u32;
 
-  println!("cargo:rerun-if-changed=src/run.c");
-  println!("cargo:rerun-if-changed=src/hvm.c");
-  println!("cargo:rerun-if-changed=src/run.cu");
-  println!("cargo:rerun-if-changed=src/hvm.cu");
+  println!("cargo:rerun-if-changed=src/c");
+  println!("cargo:rerun-if-changed=src/cuda");
+
   println!("cargo:rustc-link-arg=-rdynamic");
 
   match cc::Build::new()
-      .file("src/run.c")
+      .file("src/c/run.c")
       .opt_level(3)
       .warnings(false)
       .define("TPC_L2", &*tpcl2.to_string())
@@ -32,7 +31,7 @@ fn main() {
 
     cc::Build::new()
       .cuda(true)
-      .file("src/run.cu")
+      .file("src/cuda/run.cu")
       .define("IO", None)
       .flag("-diag-suppress=177") // variable was declared but never referenced
       .flag("-diag-suppress=550") // variable was set but never used

diff --git a/src/hvm.c → src/c/hvm.c b/src/hvm.c → src/c/hvm.c
diff --git a/src/hvm.h → src/c/hvm.h b/src/hvm.h → src/c/hvm.h
diff --git a/src/run.c → src/c/run.c b/src/run.c → src/c/run.c
diff --git a/src/cuda/alloc.cuh b/src/cuda/alloc.cuh
@@ -0,0 +1,184 @@
+// Functions to allocate variables and nodes.
+
+#ifndef alloc_cuh_INCLUDED
+#define alloc_cuh_INCLUDED
+
+#include "common.cuh"
+#include "config.cuh"
+#include "structs/vnet.cuh"
+#include <stdio.h>
+
+// Local Net Allocations
+// ---------------------
+
+template <typename A>
+__device__ u32 g_alloc_1(Net* net, u32* g_put, A* g_buf) {
+  u32 lps = 0;
+  while (true) {
+    u32 lc = GID()*(G_NODE_LEN/TPG) + (*g_put%(G_NODE_LEN/TPG));
+    A elem = g_buf[lc];
+    *g_put += 1;
+    if (lc >= L_NODE_LEN && elem == 0) {
+      return lc;
+    }
+    if (++lps >= G_NODE_LEN/TPG) printf("OOM\n"); // FIXME: remove
+    //assert(++lps < G_NODE_LEN/TPG); // FIXME: enable?
+  }
+}
+
+template <typename A>
+__device__ u32 g_alloc(Net* net, u32* ret, u32* g_put, A* g_buf, u32 num) {
+  u32 got = 0;
+  u32 lps = 0;
+  while (got < num) {
+    u32 lc = GID()*(G_NODE_LEN/TPG) + (*g_put%(G_NODE_LEN/TPG));
+    A elem = g_buf[lc];
+    *g_put += 1;
+    if (lc >= L_NODE_LEN && elem == 0) {
+      ret[got++] = lc;
+    }
+
+    if (++lps >= G_NODE_LEN/TPG) printf("OOM\n"); // FIXME: remove
+    //assert(++lps < G_NODE_LEN/TPG); // FIXME: enable?
+  }
+  return got;
+
+}
+
+template <typename A>
+__device__ u32 l_alloc(Net* net, u32* ret, u32* l_put, A* l_buf, u32 num) {
+  u32 got = 0;
+  u32 lps = 0;
+  while (got < num) {
+    u32 lc = ((*l_put)++ * TPB) % L_NODE_LEN + TID();
+    A elem = l_buf[lc];
+    if (++lps >= L_NODE_LEN/TPB) {
+      break;
+    }
+    if (lc > 0 && elem == 0) {
+      ret[got++] = lc;
+    }
+  }
+  return got;
+}
+
+template <typename A>
+__device__ u32 l_alloc_1(Net* net, u32* ret, u32* l_put, A* l_buf, u32* lps) {
+  u32 got = 0;
+  while (true) {
+    u32 lc = ((*l_put)++ * TPB) % L_NODE_LEN + TID();
+    A elem = l_buf[lc];
+    if (++(*lps) >= L_NODE_LEN/TPB) {
+      break;
+    }
+    if (lc > 0 && elem == 0) {
+      return lc;
+    }
+  }
+  return got;
+}
+
+__device__ u32 g_node_alloc_1(Net* net) {
+  return g_alloc_1(net, net->g_node_put, net->g_node_buf);
+}
+
+__device__ u32 g_vars_alloc_1(Net* net) {
+  return g_alloc_1(net, net->g_vars_put, net->g_vars_buf);
+}
+
+__device__ u32 g_node_alloc(Net* net, TM* tm, u32 num) {
+  return g_alloc(net, tm->nloc, net->g_node_put, net->g_node_buf, num);
+}
+
+__device__ u32 g_vars_alloc(Net* net, TM* tm, u32 num) {
+  return g_alloc(net, tm->vloc, net->g_vars_put, net->g_vars_buf, num);
+}
+
+__device__ u32 l_node_alloc(Net* net, TM* tm, u32 num) {
+  return l_alloc(net, tm->nloc, &tm->nput, net->l_node_buf, num);
+}
+
+__device__ u32 l_vars_alloc(Net* net, TM* tm, u32 num) {
+  return l_alloc(net, tm->vloc, &tm->vput, net->l_vars_buf, num);
+}
+
+__device__ u32 l_node_alloc_1(Net* net, TM* tm, u32* lps) {
+  return l_alloc_1(net, tm->nloc, &tm->nput, net->l_node_buf, lps);
+}
+
+__device__ u32 l_vars_alloc_1(Net* net, TM* tm, u32* lps) {
+  return l_alloc_1(net, tm->vloc, &tm->vput, net->l_vars_buf, lps);
+}
+
+__device__ u32 node_alloc_1(Net* net, TM* tm, u32* lps) {
+  if (tm->mode != WORK) {
+    return g_node_alloc_1(net);
+  } else {
+    return l_node_alloc_1(net, tm, lps);
+  }
+}
+
+__device__ u32 vars_alloc_1(Net* net, TM* tm, u32* lps) {
+  if (tm->mode != WORK) {
+    return g_vars_alloc_1(net);
+  } else {
+    return l_vars_alloc_1(net, tm, lps);
+  }
+}
+
+// Adjusts a newly allocated port.
+__device__ inline Port adjust_port(Net* net, TM* tm, Port port) {
+  Tag tag = get_tag(port);
+  Val val = get_val(port);
+  if (is_nod(port)) return new_port(tag, tm->nloc[val]);
+  if (is_var(port)) return new_port(tag, tm->vloc[val]);
+  return new_port(tag, val);
+}
+
+// Adjusts a newly allocated pair.
+__device__ inline Pair adjust_pair(Net* net, TM* tm, Pair pair) {
+  Port p1 = adjust_port(net, tm, get_fst(pair));
+  Port p2 = adjust_port(net, tm, get_snd(pair));
+  return new_pair(p1, p2);
+}
+
+// Gets the necessary resources for an interaction.
+__device__ bool get_resources(Net* net, TM* tm, u32 need_rbag, u32 need_node, u32 need_vars) {
+  u32 got_rbag = min(RLEN - tm->rbag.lo_end, RLEN - tm->rbag.hi_end);
+  u32 got_node;
+  u32 got_vars;
+  if (tm->mode != WORK) {
+    got_node = g_node_alloc(net, tm, need_node);
+    got_vars = g_vars_alloc(net, tm, need_vars);
+  } else {
+    got_node = l_node_alloc(net, tm, need_node);
+    got_vars = l_vars_alloc(net, tm, need_vars);
+  }
+  return got_rbag >= need_rbag && got_node >= need_node && got_vars >= need_vars;
+}
+
+// Global Net Allocations
+// ----------------------
+
+// Creates a node.
+__global__ void make_node(GNet* gnet, Tag tag, Port fst, Port snd, Port* ret) {
+  if (GID() == 0) {
+    Net net = vnet_new(gnet, NULL, gnet->turn);
+    u32 loc = g_node_alloc_1(&net);
+    node_create(&net, loc, new_pair(fst, snd));
+    *ret = new_port(tag, loc);
+  }
+}
+
+// Allocs and creates a node, returning its port.
+Port gnet_make_node(GNet* gnet, Tag tag, Port fst, Port snd) {
+  Port ret;
+  Port* d_ret;
+  cudaMalloc(&d_ret, sizeof(Port));
+  make_node<<<1,1>>>(gnet, tag, fst, snd, d_ret);
+  cudaMemcpy(&ret, d_ret, sizeof(Port), cudaMemcpyDeviceToHost);
+  cudaFree(d_ret);
+  return ret;
+}
+
+#endif // alloc_cuh_INCLUDED
diff --git a/src/cuda/common.cuh b/src/cuda/common.cuh
@@ -0,0 +1,80 @@
+// Common type aliases and functions used across the codebase that aren't
+// necessarily associated with any specific part.
+
+#ifndef common_cuh_INCLUDED
+#define common_cuh_INCLUDED
+
+#include <stdint.h>
+
+// Numeric Type Aliases
+// --------------------
+typedef  uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef unsigned long long int u64;
+typedef  int32_t i32;
+typedef    float f32;
+typedef   double f64;
+
+// Values
+// ------
+// Val ::= 29-bit (rounded up to u32)
+// The 29 least significant bits in a Port
+typedef u32 Val;
+
+// Tags
+// ----
+// Tag ::= 3-bit (rounded up to u8)
+// These are the 3 most significant bits in a Port, and
+// they identify the type of port.
+typedef u8 Tag;
+
+const Tag VAR = 0x0; // variable
+const Tag REF = 0x1; // reference
+const Tag ERA = 0x2; // eraser
+const Tag NUM = 0x3; // number
+const Tag CON = 0x4; // constructor
+const Tag DUP = 0x5; // duplicator
+const Tag OPR = 0x6; // operator
+const Tag SWI = 0x7; // switch
+
+// Ports
+// -----
+// Port ::= Tag + Val (fits a u32)
+typedef u32 Port;
+
+/// Pair
+/// ----
+// Pair ::= Port + Port (fits a u64)
+typedef u64 Pair;
+
+// Interaction Rules
+// -----------------
+// Rule ::= 3-bit (rounded up to 8)
+typedef u8 Rule;
+
+const Rule LINK = 0x0;
+const Rule CALL = 0x1;
+const Rule VOID = 0x2;
+const Rule ERAS = 0x3;
+const Rule ANNI = 0x4;
+const Rule COMM = 0x5;
+const Rule OPER = 0x6;
+const Rule SWIT = 0x7;
+
+// Grid Functions
+// --------------
+
+__device__ inline u32 TID() {
+  return threadIdx.x;
+}
+
+__device__ inline u32 BID() {
+  return blockIdx.x;
+}
+
+__device__ inline u32 GID() {
+  return TID() + BID() * blockDim.x;
+}
+
+#endif // common_cuh_INCLUDED
diff --git a/src/cuda/config.cuh b/src/cuda/config.cuh
@@ -0,0 +1,32 @@
+#ifndef config_cuh_INCLUDED
+#define config_cuh_INCLUDED
+
+#include "common.cuh"
+
+// Clocks per Second
+const u64 S = 2520000000;
+
+// Threads per Block
+const u32 TPB_L2 = 7;
+const u32 TPB    = 1 << TPB_L2;
+
+// Blocks per GPU
+const u32 BPG_L2 = 7;
+const u32 BPG    = 1 << BPG_L2;
+
+// Threads per GPU
+const u32 TPG = TPB * BPG;
+
+// Thread Redex Bag Length
+const u32 RLEN = 256;
+
+// Local Net
+const u32 L_NODE_LEN = 0x2000;
+const u32 L_VARS_LEN = 0x2000;
+
+// Global Net
+const u32 G_NODE_LEN = 1 << 29; // max 536m nodes
+const u32 G_VARS_LEN = 1 << 29; // max 536m vars
+const u32 G_RBAG_LEN = TPB * BPG * RLEN * 3; // max 4m redexes
+
+#endif // config_cuh_INCLUDED
diff --git a/src/cuda/constants.cuh b/src/cuda/constants.cuh
@@ -0,0 +1,16 @@
+#ifndef constants_cuh_INCLUDED
+#define constants_cuh_INCLUDED
+
+#include "common.cuh"
+
+// Special Substitution Map Values
+const Port FREE = 0x00000000;
+const Port ROOT = 0xFFFFFFF8;
+const Port NONE = 0xFFFFFFFF;
+
+// Evaluation Modes
+const u8 SEED = 0;
+const u8 GROW = 1;
+const u8 WORK = 2;
+
+#endif // constants_cuh_INCLUDED