Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split up CUDA files #425

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@ fn main() {
let cores = num_cpus::get();
let tpcl2 = (cores as f64).log2().floor() as u32;

println!("cargo:rerun-if-changed=src/run.c");
println!("cargo:rerun-if-changed=src/hvm.c");
println!("cargo:rerun-if-changed=src/run.cu");
println!("cargo:rerun-if-changed=src/hvm.cu");
println!("cargo:rerun-if-changed=src/c");
println!("cargo:rerun-if-changed=src/cuda");

println!("cargo:rustc-link-arg=-rdynamic");

match cc::Build::new()
.file("src/run.c")
.file("src/c/run.c")
.opt_level(3)
.warnings(false)
.define("TPC_L2", &*tpcl2.to_string())
Expand All @@ -32,7 +31,7 @@ fn main() {

cc::Build::new()
.cuda(true)
.file("src/run.cu")
.file("src/cuda/run.cu")
.define("IO", None)
.flag("-diag-suppress=177") // variable was declared but never referenced
.flag("-diag-suppress=550") // variable was set but never used
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
184 changes: 184 additions & 0 deletions src/cuda/alloc.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
// Functions to allocate variables and nodes.

#ifndef alloc_cuh_INCLUDED
#define alloc_cuh_INCLUDED

#include "common.cuh"
#include "config.cuh"
#include "structs/vnet.cuh"
#include <stdio.h>

// Local Net Allocations
// ---------------------

template <typename A>
__device__ u32 g_alloc_1(Net* net, u32* g_put, A* g_buf) {
u32 lps = 0;
while (true) {
u32 lc = GID()*(G_NODE_LEN/TPG) + (*g_put%(G_NODE_LEN/TPG));
A elem = g_buf[lc];
*g_put += 1;
if (lc >= L_NODE_LEN && elem == 0) {
return lc;
}
if (++lps >= G_NODE_LEN/TPG) printf("OOM\n"); // FIXME: remove
//assert(++lps < G_NODE_LEN/TPG); // FIXME: enable?
}
}

template <typename A>
__device__ u32 g_alloc(Net* net, u32* ret, u32* g_put, A* g_buf, u32 num) {
u32 got = 0;
u32 lps = 0;
while (got < num) {
u32 lc = GID()*(G_NODE_LEN/TPG) + (*g_put%(G_NODE_LEN/TPG));
A elem = g_buf[lc];
*g_put += 1;
if (lc >= L_NODE_LEN && elem == 0) {
ret[got++] = lc;
}

if (++lps >= G_NODE_LEN/TPG) printf("OOM\n"); // FIXME: remove
//assert(++lps < G_NODE_LEN/TPG); // FIXME: enable?
}
return got;

}

template <typename A>
__device__ u32 l_alloc(Net* net, u32* ret, u32* l_put, A* l_buf, u32 num) {
u32 got = 0;
u32 lps = 0;
while (got < num) {
u32 lc = ((*l_put)++ * TPB) % L_NODE_LEN + TID();
A elem = l_buf[lc];
if (++lps >= L_NODE_LEN/TPB) {
break;
}
if (lc > 0 && elem == 0) {
ret[got++] = lc;
}
}
return got;
}

template <typename A>
__device__ u32 l_alloc_1(Net* net, u32* ret, u32* l_put, A* l_buf, u32* lps) {
u32 got = 0;
while (true) {
u32 lc = ((*l_put)++ * TPB) % L_NODE_LEN + TID();
A elem = l_buf[lc];
if (++(*lps) >= L_NODE_LEN/TPB) {
break;
}
if (lc > 0 && elem == 0) {
return lc;
}
}
return got;
}

__device__ u32 g_node_alloc_1(Net* net) {
return g_alloc_1(net, net->g_node_put, net->g_node_buf);
}

__device__ u32 g_vars_alloc_1(Net* net) {
return g_alloc_1(net, net->g_vars_put, net->g_vars_buf);
}

__device__ u32 g_node_alloc(Net* net, TM* tm, u32 num) {
return g_alloc(net, tm->nloc, net->g_node_put, net->g_node_buf, num);
}

__device__ u32 g_vars_alloc(Net* net, TM* tm, u32 num) {
return g_alloc(net, tm->vloc, net->g_vars_put, net->g_vars_buf, num);
}

__device__ u32 l_node_alloc(Net* net, TM* tm, u32 num) {
return l_alloc(net, tm->nloc, &tm->nput, net->l_node_buf, num);
}

__device__ u32 l_vars_alloc(Net* net, TM* tm, u32 num) {
return l_alloc(net, tm->vloc, &tm->vput, net->l_vars_buf, num);
}

__device__ u32 l_node_alloc_1(Net* net, TM* tm, u32* lps) {
return l_alloc_1(net, tm->nloc, &tm->nput, net->l_node_buf, lps);
}

__device__ u32 l_vars_alloc_1(Net* net, TM* tm, u32* lps) {
return l_alloc_1(net, tm->vloc, &tm->vput, net->l_vars_buf, lps);
}

__device__ u32 node_alloc_1(Net* net, TM* tm, u32* lps) {
if (tm->mode != WORK) {
return g_node_alloc_1(net);
} else {
return l_node_alloc_1(net, tm, lps);
}
}

__device__ u32 vars_alloc_1(Net* net, TM* tm, u32* lps) {
if (tm->mode != WORK) {
return g_vars_alloc_1(net);
} else {
return l_vars_alloc_1(net, tm, lps);
}
}

// Adjusts a newly allocated port.
__device__ inline Port adjust_port(Net* net, TM* tm, Port port) {
Tag tag = get_tag(port);
Val val = get_val(port);
if (is_nod(port)) return new_port(tag, tm->nloc[val]);
if (is_var(port)) return new_port(tag, tm->vloc[val]);
return new_port(tag, val);
}

// Adjusts a newly allocated pair.
__device__ inline Pair adjust_pair(Net* net, TM* tm, Pair pair) {
Port p1 = adjust_port(net, tm, get_fst(pair));
Port p2 = adjust_port(net, tm, get_snd(pair));
return new_pair(p1, p2);
}

// Gets the necessary resources for an interaction.
__device__ bool get_resources(Net* net, TM* tm, u32 need_rbag, u32 need_node, u32 need_vars) {
u32 got_rbag = min(RLEN - tm->rbag.lo_end, RLEN - tm->rbag.hi_end);
u32 got_node;
u32 got_vars;
if (tm->mode != WORK) {
got_node = g_node_alloc(net, tm, need_node);
got_vars = g_vars_alloc(net, tm, need_vars);
} else {
got_node = l_node_alloc(net, tm, need_node);
got_vars = l_vars_alloc(net, tm, need_vars);
}
return got_rbag >= need_rbag && got_node >= need_node && got_vars >= need_vars;
}

// Global Net Allocations
// ----------------------

// Creates a node.
__global__ void make_node(GNet* gnet, Tag tag, Port fst, Port snd, Port* ret) {
if (GID() == 0) {
Net net = vnet_new(gnet, NULL, gnet->turn);
u32 loc = g_node_alloc_1(&net);
node_create(&net, loc, new_pair(fst, snd));
*ret = new_port(tag, loc);
}
}

// Allocs and creates a node, returning its port.
Port gnet_make_node(GNet* gnet, Tag tag, Port fst, Port snd) {
Port ret;
Port* d_ret;
cudaMalloc(&d_ret, sizeof(Port));
make_node<<<1,1>>>(gnet, tag, fst, snd, d_ret);
cudaMemcpy(&ret, d_ret, sizeof(Port), cudaMemcpyDeviceToHost);
cudaFree(d_ret);
return ret;
}

#endif // alloc_cuh_INCLUDED
80 changes: 80 additions & 0 deletions src/cuda/common.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Common type aliases and functions used across the codebase that aren't
// necessarily associated with any specific part.

#ifndef common_cuh_INCLUDED
#define common_cuh_INCLUDED

#include <stdint.h>

// Numeric Type Aliases
// --------------------
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef unsigned long long int u64;
typedef int32_t i32;
typedef float f32;
typedef double f64;

// Values
// ------
// Val ::= 29-bit (rounded up to u32)
// The 29 least significant bits in a Port
typedef u32 Val;

// Tags
// ----
// Tag ::= 3-bit (rounded up to u8)
// These are the 3 most significant bits in a Port, and
// they identify the type of port.
typedef u8 Tag;

const Tag VAR = 0x0; // variable
const Tag REF = 0x1; // reference
const Tag ERA = 0x2; // eraser
const Tag NUM = 0x3; // number
const Tag CON = 0x4; // constructor
const Tag DUP = 0x5; // duplicator
const Tag OPR = 0x6; // operator
const Tag SWI = 0x7; // switch

// Ports
// -----
// Port ::= Tag + Val (fits a u32)
typedef u32 Port;

/// Pair
/// ----
// Pair ::= Port + Port (fits a u64)
typedef u64 Pair;

// Interaction Rules
// -----------------
// Rule ::= 3-bit (rounded up to 8)
typedef u8 Rule;

const Rule LINK = 0x0;
const Rule CALL = 0x1;
const Rule VOID = 0x2;
const Rule ERAS = 0x3;
const Rule ANNI = 0x4;
const Rule COMM = 0x5;
const Rule OPER = 0x6;
const Rule SWIT = 0x7;

// Grid Functions
// --------------

__device__ inline u32 TID() {
return threadIdx.x;
}

__device__ inline u32 BID() {
return blockIdx.x;
}

__device__ inline u32 GID() {
return TID() + BID() * blockDim.x;
}

#endif // common_cuh_INCLUDED
32 changes: 32 additions & 0 deletions src/cuda/config.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef config_cuh_INCLUDED
#define config_cuh_INCLUDED

#include "common.cuh"

// Clocks per Second
const u64 S = 2520000000;

// Threads per Block
const u32 TPB_L2 = 7;
const u32 TPB = 1 << TPB_L2;

// Blocks per GPU
const u32 BPG_L2 = 7;
const u32 BPG = 1 << BPG_L2;

// Threads per GPU
const u32 TPG = TPB * BPG;

// Thread Redex Bag Length
const u32 RLEN = 256;

// Local Net
const u32 L_NODE_LEN = 0x2000;
const u32 L_VARS_LEN = 0x2000;

// Global Net
const u32 G_NODE_LEN = 1 << 29; // max 536m nodes
const u32 G_VARS_LEN = 1 << 29; // max 536m vars
const u32 G_RBAG_LEN = TPB * BPG * RLEN * 3; // max 4m redexes

#endif // config_cuh_INCLUDED
16 changes: 16 additions & 0 deletions src/cuda/constants.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef constants_cuh_INCLUDED
#define constants_cuh_INCLUDED

#include "common.cuh"

// Special Substitution Map Values
const Port FREE = 0x00000000;
const Port ROOT = 0xFFFFFFF8;
const Port NONE = 0xFFFFFFFF;

// Evaluation Modes
const u8 SEED = 0;
const u8 GROW = 1;
const u8 WORK = 2;

#endif // constants_cuh_INCLUDED
Loading
Loading