Skip to content

Commit

Permalink
Merge pull request #28 from THU-DSP-LAB/27-over-4G
Browse files Browse the repository at this point in the history
[BUG] Run workgroups one by one
  • Loading branch information
yangzexia authored Mar 7, 2024
2 parents 2dd37c0 + 06bd5ba commit c5aa3eb
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 60 deletions.
2 changes: 2 additions & 0 deletions fesvr/syscall.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,15 @@ syscall_t::syscall_t(htif_t* htif)

register_command(0, std::bind(&syscall_t::handle_syscall, this, _1), "syscall");

#if 0
int stdin_fd = dup(0), stdout_fd0 = dup(1), stdout_fd1 = dup(1);
if (stdin_fd < 0 || stdout_fd0 < 0 || stdout_fd1 < 0)
throw std::runtime_error("could not dup stdin/stdout");

fds.alloc(stdin_fd); // stdin -> stdin
fds.alloc(stdout_fd0); // stdout -> stdout
fds.alloc(stdout_fd1); // stderr -> stdout
#endif
}

std::string syscall_t::do_chroot(const char* fn)
Expand Down
2 changes: 1 addition & 1 deletion riscv/insns/endprg.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
p->get_sim()->modify_reach_end();
p->gpgpu_unit.w->set_barrier_2(p->get_csr(CSR_WID));
if(p->get_sim()->get_reach_end()){
std::cout<<"all warps reach the endprg. now proc 0 will end the simulation."<<std::endl;
// std::cout<<"all warps reach the endprg. now proc 0 will end the simulation."<<std::endl;
p->get_sim()->append_reach_end();
//return 0;
}
2 changes: 1 addition & 1 deletion riscv/log_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class log_file_t
if (!path)
return;

wrapped_file.reset(fopen(path, "w"));
wrapped_file.reset(fopen(path, "a"));
if (! wrapped_file) {
std::ostringstream oss;
oss << "Failed to open log file at `" << path << "': "
Expand Down
10 changes: 8 additions & 2 deletions riscv/processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1285,6 +1285,7 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s)
uint64_t pdssize=0;
uint64_t pdsbase=0x78000000;
uint64_t knlbase=0x80000000;
uint64_t currwgid=0;
size_t kernel_size[3]={0,1,1};

while (pos < len) {
Expand Down Expand Up @@ -1314,6 +1315,8 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s)
pdsbase = get_long_token(str,',',pos);
else if (attr == "knlbase")
knlbase = get_long_token(str,',',pos);
else if (attr == "currwgid")
currwgid = get_long_token(str,',',pos);
else
bad_gpgpuarch_string(s, "Unsupported token");
++pos;
Expand All @@ -1332,6 +1335,7 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s)
pds_size = pdssize == 0 ? (numw * numt )<< 10 : pdssize;
pds_base = pdsbase;
knl_base = knlbase;
curr_wgid = currwgid;

kernel_size[0] = kernel_size[0]==0 ? (numwg/(kernel_size[1]*kernel_size[2])) : kernel_size[0];
workgroup_size_x=kernel_size[0];
Expand All @@ -1342,7 +1346,9 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s)
bad_gpgpuarch_string(s, "kernel size doesn't match total wg size");
}

std::cout << std::dec<<"warp number: " << warp_number << " thread number = " << thread_number << " workgroup number = "<< workgroup_number \
/*
std::cout << "--- warp number: " << warp_number << " thread number = " << thread_number << " workgroup number = "<< workgroup_number \
<<" workgroup dimension:"<<kernel_size[0]<<"*"<<kernel_size[1]<<"*"<<kernel_size[2] \
<<std::hex<<" lds size: 0x"<<lds_size<<" pds size: 0x"<<pds_size<<" lds base: 0x"<<lds_base<<" pds base: 0x"<<pds_base<<" knl base: 0x"<<knl_base << std::endl;
<<std::hex<<" lds size: "<<lds_size<<" pds size: "<<pds_size<<" lds base: "<<lds_base<<" pds base: "<<pds_base \
<<" knl base: "<<knl_base << " current workgroup id: " << curr_wgid << std::endl;*/
}
2 changes: 1 addition & 1 deletion riscv/processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class warp_schedule_t
std::vector<int> barriers;
bool is_all_true;
int barrier_counter;
uint64_t lds_base,lds_size,pds_base,pds_size,knl_base;
uint64_t lds_base,lds_size,pds_base,pds_size,knl_base, curr_wgid;
};

struct insn_desc_t //mask
Expand Down
53 changes: 36 additions & 17 deletions riscv/sim.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,14 +102,24 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
uint64_t pds_size=w.pds_size;
uint64_t lds_size=w.lds_size;

uint64_t pds=pds_base;
uint64_t lds=lds_base;
uint64_t pds = pds_base;
uint64_t lds = lds_base;

w.workgroup_number = 1;

uint64_t spike_curr_wgid = w.curr_wgid;

assert(spike_curr_wgid < w.workgroup_size_x * w.workgroup_size_y * w.workgroup_size_z);
gidz = spike_curr_wgid / (w.workgroup_size_x * w.workgroup_size_y);
gidy = (spike_curr_wgid % (w.workgroup_size_x * w.workgroup_size_y)) / w.workgroup_size_x;
gidx = (spike_curr_wgid % (w.workgroup_size_x * w.workgroup_size_y)) % w.workgroup_size_x;
//printf("simt() current_wgid is %ld, gidx %ld, gidy %ld, gidz %ld\n", spike_curr_wgid, gidx, gidy, gidz);

workgroups = new warp_schedule_t[w.workgroup_number];
for (size_t i=0;i<w.workgroup_number;i++){
for (size_t i=0;i<w.workgroup_number;i++) {
assert(w.warp_number>0 & w.workgroup_number>0 & w.thread_number>0);
assert(w.warp_number * w.workgroup_number == cfg->nprocs());
workgroups[i].set_warp_schedule(w.warp_number,w.thread_number,w.workgroup_number,i);
workgroups[i].set_warp_schedule(w.warp_number,w.thread_number,w.workgroup_number, spike_curr_wgid);

for (size_t j = 0; j < w.warp_number; j++) {
reach_end[i*w.warp_number+j] = i*w.warp_number+j;
Expand All @@ -118,26 +128,33 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,

procs[i*w.warp_number+j]->gpgpu_unit.set_warp(&workgroups[i]);//workgroups[i]);
//现在一个warp就是一个core
procs[i*w.warp_number+j]->gpgpu_unit.init_warp(w.warp_number, w.thread_number, j*w.thread_number, i,j, pds, lds, knl_base, gidx,gidy,gidz);
procs[i*w.warp_number+j]->gpgpu_unit.init_warp(w.warp_number, w.thread_number,
j * w.thread_number, spike_curr_wgid, j, pds, lds, knl_base, gidx, gidy, gidz);
assert(w.thread_number == (procs[i]->VU.get_vlen() / procs[i]->VU.get_elen()));
}

gidx=gidx+1;
if(gidx==w.workgroup_size_x){
gidx=0;
gidy=gidy+1;
if(gidy==w.workgroup_size_y){
gidy=0;
gidz=gidz+1;
if(gidz==w.workgroup_size_z){gidz=0;}
}
#if 0
gidx = gidx+1;
if(gidx == w.workgroup_size_x) {
gidx = 0;
gidy = gidy + 1;

if(gidy == w.workgroup_size_y) {
gidy = 0;
gidz = gidz+1;

if(gidz == w.workgroup_size_z) {
gidz = 0;
}
}
}
pds=pds+pds_size;
lds=lds+lds_size;
#endif
pds = pds + pds_size;
lds = lds + lds_size;
}



#if 0
make_dtb();

void *fdt = (void *)dtb.c_str();
Expand Down Expand Up @@ -224,6 +241,8 @@ sim_t::sim_t(const cfg_t *cfg, bool halted,
<< nprocs() << ").\n";
exit(1);
}
#endif

}

sim_t::~sim_t()
Expand Down
92 changes: 54 additions & 38 deletions spike_main/spike_device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ int spike_device::set_filename(const char* filename,const char* logname){
return 0;
}


#define SPIKE_RUN_WG_NUM 1
int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){
uint64_t num_warp=knl_data->wg_size;
uint64_t num_thread=knl_data->wf_size;
Expand All @@ -354,10 +354,12 @@ int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){
uint64_t num_workgroup=num_workgroup_x*num_workgroup_y*num_workgroup_z;
uint64_t num_processor=num_warp*num_workgroup;
uint64_t ldssize=knl_data->ldsSize;
uint64_t pdssize=knl_data->pdsSize*num_thread;
//uint64_t pdssize=knl_data->pdsSize * num_thread;
uint64_t pdssize = 0x10000000;
uint64_t pdsbase=knl_data->pdsBaseAddr;
uint64_t start_pc=knl_start_pc;
uint64_t knlbase=knl_data->metaDataBaseAddr;
uint64_t currwgid = 0;

if ((ldssize)>0x10000000) {
fprintf(stderr, "lds size is too large. please modify VBASEADDR");
Expand Down Expand Up @@ -545,8 +547,8 @@ int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){
char arg_logfilename[64];
sprintf(arg_logfilename,"--log=%s",logfilename);
sprintf(arg_num_core,"-p%ld",num_processor);
sprintf(arg_gpgpu,"numw:%ld,numt:%ld,numwg:%ld,kernelx:%ld,kernely:%ld,kernelz:%ld,ldssize:0x%lx,pdssize:0x%lx,pdsbase:0x%lx,knlbase:0x%lx",\
num_warp,num_thread,num_workgroup,num_workgroup_x,num_workgroup_y,num_workgroup_z,ldssize,pdssize,pdsbase,knlbase);
sprintf(arg_gpgpu,"numw:%ld,numt:%ld,numwg:%ld,kernelx:%ld,kernely:%ld,kernelz:%ld,ldssize:0x%lx,pdssize:0x%lx,pdsbase:0x%lx,knlbase:0x%lx,currwgid:%lx",\
num_warp,num_thread,num_workgroup,num_workgroup_x,num_workgroup_y,num_workgroup_z,ldssize,pdssize,pdsbase,knlbase,currwgid);
printf("arg gpgpu is %s\n",arg_gpgpu);
sprintf(arg_vlen_elen,"vlen:%ld,elen:%d",num_thread*32,32);
sprintf(arg_mem_scope,"-m0x70000000:0x%lx",buffer.back().base+buffer.back().size);
Expand Down Expand Up @@ -607,54 +609,68 @@ int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){
// we've only set the number of harts, not explicitly chosen their IDs).
std::vector<int> default_hartids;
default_hartids.reserve(nprocs());
for (size_t i = 0; i < nprocs(); ++i) {
for (size_t i = 0; i < num_warp * SPIKE_RUN_WG_NUM; ++i) {
default_hartids.push_back(i);
}
cfg.hartids = default_hartids;
}

std::vector<std::pair<reg_t, mem_t*>> all_buffer_data(const_buffer_data);
for(auto ele : buffer_data) all_buffer_data.push_back(ele);
sim=new sim_t(&cfg, halted,
all_buffer_data, plugin_devices, htif_args, dm_config, log_path, dtb_enabled, dtb_file,cmd_file);
std::unique_ptr<remote_bitbang_t> remote_bitbang((remote_bitbang_t *) NULL);
/*std::unique_ptr<jtag_dtm_t> jtag_dtm(new jtag_dtm_t(sim->debug_module, dmi_rti));
if (use_rbb) {
remote_bitbang.reset(new remote_bitbang_t(rbb_port, &(*jtag_dtm)));
sim->set_remote_bitbang(&(*remote_bitbang));
}*/
for(auto ele : buffer_data) {
all_buffer_data.push_back(ele);
}

if (dump_dts) {
printf("%s", sim->get_dts());
return 0;
}

if (ic && l2) ic->set_miss_handler(&*l2);
if (dc && l2) dc->set_miss_handler(&*l2);
if (ic) ic->set_log(log_cache);
if (dc) dc->set_log(log_cache);
for (size_t i = 0; i < cfg.nprocs(); i++)
auto return_code = 0;
// char log_name[256] = {0};
for (uint64_t i = 0; i < num_workgroup / SPIKE_RUN_WG_NUM; i++)
{
if (ic) sim->get_core(i)->get_mmu()->register_memtracer(&*ic);
if (dc) sim->get_core(i)->get_mmu()->register_memtracer(&*dc);
for (auto e : extensions)
sim->get_core(i)->register_extension(e());
sim->get_core(i)->get_mmu()->set_cache_blocksz(blocksz);
sim=new sim_t(&cfg, halted,
all_buffer_data, plugin_devices, htif_args, dm_config, log_path, dtb_enabled, dtb_file,cmd_file);
std::unique_ptr<remote_bitbang_t> remote_bitbang((remote_bitbang_t *) NULL);
/*std::unique_ptr<jtag_dtm_t> jtag_dtm(new jtag_dtm_t(sim->debug_module, dmi_rti));
if (use_rbb) {
remote_bitbang.reset(new remote_bitbang_t(rbb_port, &(*jtag_dtm)));
sim->set_remote_bitbang(&(*remote_bitbang));
}*/

if (dump_dts) {
printf("%s", sim->get_dts());
return 0;
}

if (ic && l2) ic->set_miss_handler(&*l2);
if (dc && l2) dc->set_miss_handler(&*l2);
if (ic) ic->set_log(log_cache);
if (dc) dc->set_log(log_cache);

for (size_t i = 0; i < num_warp; i++)
{
if (ic) sim->get_core(i)->get_mmu()->register_memtracer(&*ic);
if (dc) sim->get_core(i)->get_mmu()->register_memtracer(&*dc);
for (auto e : extensions)
sim->get_core(i)->register_extension(e());
sim->get_core(i)->get_mmu()->set_cache_blocksz(blocksz);
}

sim->set_debug(debug);
sim->configure_log(log, log_commits);
sim->set_histogram(histogram);

return_code = sim->run();
currwgid++;
sprintf(arg_gpgpu,"numw:%ld,numt:%ld,numwg:%ld,kernelx:%ld,kernely:%ld,kernelz:%ld,ldssize:0x%lx,pdssize:0x%lx,pdsbase:0x%lx,knlbase:0x%lx,currwgid:%lx",\
num_warp,num_thread,num_workgroup,num_workgroup_x,num_workgroup_y,num_workgroup_z,ldssize,pdssize,pdsbase,knlbase,currwgid);
// sprintf(log_name, "object_%ld.riscv.log", currwgid);
// log_path = log_name;

delete sim;
}

sim->set_debug(debug);
sim->configure_log(log, log_commits);
sim->set_histogram(histogram);

auto return_code = sim->run();

//for (auto& mem : buffer_data)
//delete mem.second;

for (auto& plugin_device : plugin_devices)
delete plugin_device.second;
delete[] argv;
delete sim;
// delete sim;
return return_code;
}

0 comments on commit c5aa3eb

Please sign in to comment.