From 06bd5baa1bd871da78883e20ef085f975d20ef52 Mon Sep 17 00:00:00 2001 From: lifeiabc Date: Fri, 24 Nov 2023 14:09:34 +0800 Subject: [PATCH] Run workgroups one by one - Run workgroups one by one, to avoid allocating memory for all warps one time - The private memory is temporarily specified to 256MB, 2048 warps in one workgroup can be supported - Remove make dts files - Core number in logfile is different, search 'endprg' to distinguish different workgroups - Remove dup stdin, stdout, stderr to limit max file descriptor - Should limit the pdssize to 256MB in POCL at the same time --- fesvr/syscall.cc | 2 + riscv/insns/endprg.h | 2 +- riscv/log_file.h | 2 +- riscv/processor.cc | 10 ++++- riscv/processor.h | 2 +- riscv/sim.cc | 53 +++++++++++++++------- spike_main/spike_device.cc | 92 ++++++++++++++++++++++---------------- 7 files changed, 103 insertions(+), 60 deletions(-) diff --git a/fesvr/syscall.cc b/fesvr/syscall.cc index ab7fc3b4..04fd6bce 100644 --- a/fesvr/syscall.cc +++ b/fesvr/syscall.cc @@ -165,6 +165,7 @@ syscall_t::syscall_t(htif_t* htif) register_command(0, std::bind(&syscall_t::handle_syscall, this, _1), "syscall"); +#if 0 int stdin_fd = dup(0), stdout_fd0 = dup(1), stdout_fd1 = dup(1); if (stdin_fd < 0 || stdout_fd0 < 0 || stdout_fd1 < 0) throw std::runtime_error("could not dup stdin/stdout"); @@ -172,6 +173,7 @@ syscall_t::syscall_t(htif_t* htif) fds.alloc(stdin_fd); // stdin -> stdin fds.alloc(stdout_fd0); // stdout -> stdout fds.alloc(stdout_fd1); // stderr -> stdout +#endif } std::string syscall_t::do_chroot(const char* fn) diff --git a/riscv/insns/endprg.h b/riscv/insns/endprg.h index dd273e5f..f989720e 100644 --- a/riscv/insns/endprg.h +++ b/riscv/insns/endprg.h @@ -1,7 +1,7 @@ p->get_sim()->modify_reach_end(); p->gpgpu_unit.w->set_barrier_2(p->get_csr(CSR_WID)); if(p->get_sim()->get_reach_end()){ - std::cout<<"all warps reach the endprg. now proc 0 will end the simulation."<get_sim()->append_reach_end(); //return 0; } diff --git a/riscv/log_file.h b/riscv/log_file.h index d039859d..c3c6afc2 100644 --- a/riscv/log_file.h +++ b/riscv/log_file.h @@ -19,7 +19,7 @@ class log_file_t if (!path) return; - wrapped_file.reset(fopen(path, "w")); + wrapped_file.reset(fopen(path, "a")); if (! wrapped_file) { std::ostringstream oss; oss << "Failed to open log file at `" << path << "': " diff --git a/riscv/processor.cc b/riscv/processor.cc index 01325bc3..4ee63c22 100644 --- a/riscv/processor.cc +++ b/riscv/processor.cc @@ -1285,6 +1285,7 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s) uint64_t pdssize=0; uint64_t pdsbase=0x78000000; uint64_t knlbase=0x80000000; + uint64_t currwgid=0; size_t kernel_size[3]={0,1,1}; while (pos < len) { @@ -1314,6 +1315,8 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s) pdsbase = get_long_token(str,',',pos); else if (attr == "knlbase") knlbase = get_long_token(str,',',pos); + else if (attr == "currwgid") + currwgid = get_long_token(str,',',pos); else bad_gpgpuarch_string(s, "Unsupported token"); ++pos; @@ -1332,6 +1335,7 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s) pds_size = pdssize == 0 ? (numw * numt )<< 10 : pdssize; pds_base = pdsbase; knl_base = knlbase; + curr_wgid = currwgid; kernel_size[0] = kernel_size[0]==0 ? (numwg/(kernel_size[1]*kernel_size[2])) : kernel_size[0]; workgroup_size_x=kernel_size[0]; @@ -1342,7 +1346,9 @@ void warp_schedule_t::parse_gpgpuarch_string(const char *s) bad_gpgpuarch_string(s, "kernel size doesn't match total wg size"); } - std::cout << std::dec<<"warp number: " << warp_number << " thread number = " << thread_number << " workgroup number = "<< workgroup_number \ + /* + std::cout << "--- warp number: " << warp_number << " thread number = " << thread_number << " workgroup number = "<< workgroup_number \ <<" workgroup dimension:"< barriers; bool is_all_true; int barrier_counter; - uint64_t lds_base,lds_size,pds_base,pds_size,knl_base; + uint64_t lds_base,lds_size,pds_base,pds_size,knl_base, curr_wgid; }; struct insn_desc_t //mask diff --git a/riscv/sim.cc b/riscv/sim.cc index d42a223d..a253eac4 100644 --- a/riscv/sim.cc +++ b/riscv/sim.cc @@ -102,14 +102,24 @@ sim_t::sim_t(const cfg_t *cfg, bool halted, uint64_t pds_size=w.pds_size; uint64_t lds_size=w.lds_size; - uint64_t pds=pds_base; - uint64_t lds=lds_base; + uint64_t pds = pds_base; + uint64_t lds = lds_base; + + w.workgroup_number = 1; + + uint64_t spike_curr_wgid = w.curr_wgid; + + assert(spike_curr_wgid < w.workgroup_size_x * w.workgroup_size_y * w.workgroup_size_z); + gidz = spike_curr_wgid / (w.workgroup_size_x * w.workgroup_size_y); + gidy = (spike_curr_wgid % (w.workgroup_size_x * w.workgroup_size_y)) / w.workgroup_size_x; + gidx = (spike_curr_wgid % (w.workgroup_size_x * w.workgroup_size_y)) % w.workgroup_size_x; + //printf("simt() current_wgid is %ld, gidx %ld, gidy %ld, gidz %ld\n", spike_curr_wgid, gidx, gidy, gidz); workgroups = new warp_schedule_t[w.workgroup_number]; - for (size_t i=0;i0 & w.workgroup_number>0 & w.thread_number>0); assert(w.warp_number * w.workgroup_number == cfg->nprocs()); - workgroups[i].set_warp_schedule(w.warp_number,w.thread_number,w.workgroup_number,i); + workgroups[i].set_warp_schedule(w.warp_number,w.thread_number,w.workgroup_number, spike_curr_wgid); for (size_t j = 0; j < w.warp_number; j++) { reach_end[i*w.warp_number+j] = i*w.warp_number+j; @@ -118,26 +128,33 @@ sim_t::sim_t(const cfg_t *cfg, bool halted, procs[i*w.warp_number+j]->gpgpu_unit.set_warp(&workgroups[i]);//workgroups[i]); //现在一个warp就是一个core - procs[i*w.warp_number+j]->gpgpu_unit.init_warp(w.warp_number, w.thread_number, j*w.thread_number, i,j, pds, lds, knl_base, gidx,gidy,gidz); + procs[i*w.warp_number+j]->gpgpu_unit.init_warp(w.warp_number, w.thread_number, + j * w.thread_number, spike_curr_wgid, j, pds, lds, knl_base, gidx, gidy, gidz); assert(w.thread_number == (procs[i]->VU.get_vlen() / procs[i]->VU.get_elen())); } - gidx=gidx+1; - if(gidx==w.workgroup_size_x){ - gidx=0; - gidy=gidy+1; - if(gidy==w.workgroup_size_y){ - gidy=0; - gidz=gidz+1; - if(gidz==w.workgroup_size_z){gidz=0;} - } +#if 0 + gidx = gidx+1; + if(gidx == w.workgroup_size_x) { + gidx = 0; + gidy = gidy + 1; + + if(gidy == w.workgroup_size_y) { + gidy = 0; + gidz = gidz+1; + + if(gidz == w.workgroup_size_z) { + gidz = 0; + } + } } - pds=pds+pds_size; - lds=lds+lds_size; +#endif + pds = pds + pds_size; + lds = lds + lds_size; } - +#if 0 make_dtb(); void *fdt = (void *)dtb.c_str(); @@ -224,6 +241,8 @@ sim_t::sim_t(const cfg_t *cfg, bool halted, << nprocs() << ").\n"; exit(1); } +#endif + } sim_t::~sim_t() diff --git a/spike_main/spike_device.cc b/spike_main/spike_device.cc index 68cb0701..7b1b2aaf 100644 --- a/spike_main/spike_device.cc +++ b/spike_main/spike_device.cc @@ -344,7 +344,7 @@ int spike_device::set_filename(const char* filename,const char* logname){ return 0; } - +#define SPIKE_RUN_WG_NUM 1 int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){ uint64_t num_warp=knl_data->wg_size; uint64_t num_thread=knl_data->wf_size; @@ -354,10 +354,12 @@ int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){ uint64_t num_workgroup=num_workgroup_x*num_workgroup_y*num_workgroup_z; uint64_t num_processor=num_warp*num_workgroup; uint64_t ldssize=knl_data->ldsSize; - uint64_t pdssize=knl_data->pdsSize*num_thread; + //uint64_t pdssize=knl_data->pdsSize * num_thread; + uint64_t pdssize = 0x10000000; uint64_t pdsbase=knl_data->pdsBaseAddr; uint64_t start_pc=knl_start_pc; uint64_t knlbase=knl_data->metaDataBaseAddr; + uint64_t currwgid = 0; if ((ldssize)>0x10000000) { fprintf(stderr, "lds size is too large. please modify VBASEADDR"); @@ -545,8 +547,8 @@ int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){ char arg_logfilename[64]; sprintf(arg_logfilename,"--log=%s",logfilename); sprintf(arg_num_core,"-p%ld",num_processor); - sprintf(arg_gpgpu,"numw:%ld,numt:%ld,numwg:%ld,kernelx:%ld,kernely:%ld,kernelz:%ld,ldssize:0x%lx,pdssize:0x%lx,pdsbase:0x%lx,knlbase:0x%lx",\ - num_warp,num_thread,num_workgroup,num_workgroup_x,num_workgroup_y,num_workgroup_z,ldssize,pdssize,pdsbase,knlbase); + sprintf(arg_gpgpu,"numw:%ld,numt:%ld,numwg:%ld,kernelx:%ld,kernely:%ld,kernelz:%ld,ldssize:0x%lx,pdssize:0x%lx,pdsbase:0x%lx,knlbase:0x%lx,currwgid:%lx",\ + num_warp,num_thread,num_workgroup,num_workgroup_x,num_workgroup_y,num_workgroup_z,ldssize,pdssize,pdsbase,knlbase,currwgid); printf("arg gpgpu is %s\n",arg_gpgpu); sprintf(arg_vlen_elen,"vlen:%ld,elen:%d",num_thread*32,32); sprintf(arg_mem_scope,"-m0x70000000:0x%lx",buffer.back().base+buffer.back().size); @@ -607,54 +609,68 @@ int spike_device::run(meta_data* knl_data,uint64_t knl_start_pc){ // we've only set the number of harts, not explicitly chosen their IDs). std::vector default_hartids; default_hartids.reserve(nprocs()); - for (size_t i = 0; i < nprocs(); ++i) { + for (size_t i = 0; i < num_warp * SPIKE_RUN_WG_NUM; ++i) { default_hartids.push_back(i); } cfg.hartids = default_hartids; } std::vector> all_buffer_data(const_buffer_data); - for(auto ele : buffer_data) all_buffer_data.push_back(ele); - sim=new sim_t(&cfg, halted, - all_buffer_data, plugin_devices, htif_args, dm_config, log_path, dtb_enabled, dtb_file,cmd_file); - std::unique_ptr remote_bitbang((remote_bitbang_t *) NULL); - /*std::unique_ptr jtag_dtm(new jtag_dtm_t(sim->debug_module, dmi_rti)); - if (use_rbb) { - remote_bitbang.reset(new remote_bitbang_t(rbb_port, &(*jtag_dtm))); - sim->set_remote_bitbang(&(*remote_bitbang)); - }*/ + for(auto ele : buffer_data) { + all_buffer_data.push_back(ele); + } - if (dump_dts) { - printf("%s", sim->get_dts()); - return 0; - } - if (ic && l2) ic->set_miss_handler(&*l2); - if (dc && l2) dc->set_miss_handler(&*l2); - if (ic) ic->set_log(log_cache); - if (dc) dc->set_log(log_cache); - for (size_t i = 0; i < cfg.nprocs(); i++) + auto return_code = 0; +// char log_name[256] = {0}; + for (uint64_t i = 0; i < num_workgroup / SPIKE_RUN_WG_NUM; i++) { - if (ic) sim->get_core(i)->get_mmu()->register_memtracer(&*ic); - if (dc) sim->get_core(i)->get_mmu()->register_memtracer(&*dc); - for (auto e : extensions) - sim->get_core(i)->register_extension(e()); - sim->get_core(i)->get_mmu()->set_cache_blocksz(blocksz); + sim=new sim_t(&cfg, halted, + all_buffer_data, plugin_devices, htif_args, dm_config, log_path, dtb_enabled, dtb_file,cmd_file); + std::unique_ptr remote_bitbang((remote_bitbang_t *) NULL); + /*std::unique_ptr jtag_dtm(new jtag_dtm_t(sim->debug_module, dmi_rti)); + if (use_rbb) { + remote_bitbang.reset(new remote_bitbang_t(rbb_port, &(*jtag_dtm))); + sim->set_remote_bitbang(&(*remote_bitbang)); + }*/ + + if (dump_dts) { + printf("%s", sim->get_dts()); + return 0; + } + + if (ic && l2) ic->set_miss_handler(&*l2); + if (dc && l2) dc->set_miss_handler(&*l2); + if (ic) ic->set_log(log_cache); + if (dc) dc->set_log(log_cache); + + for (size_t i = 0; i < num_warp; i++) + { + if (ic) sim->get_core(i)->get_mmu()->register_memtracer(&*ic); + if (dc) sim->get_core(i)->get_mmu()->register_memtracer(&*dc); + for (auto e : extensions) + sim->get_core(i)->register_extension(e()); + sim->get_core(i)->get_mmu()->set_cache_blocksz(blocksz); + } + + sim->set_debug(debug); + sim->configure_log(log, log_commits); + sim->set_histogram(histogram); + + return_code = sim->run(); + currwgid++; + sprintf(arg_gpgpu,"numw:%ld,numt:%ld,numwg:%ld,kernelx:%ld,kernely:%ld,kernelz:%ld,ldssize:0x%lx,pdssize:0x%lx,pdsbase:0x%lx,knlbase:0x%lx,currwgid:%lx",\ + num_warp,num_thread,num_workgroup,num_workgroup_x,num_workgroup_y,num_workgroup_z,ldssize,pdssize,pdsbase,knlbase,currwgid); + // sprintf(log_name, "object_%ld.riscv.log", currwgid); + // log_path = log_name; + + delete sim; } - sim->set_debug(debug); - sim->configure_log(log, log_commits); - sim->set_histogram(histogram); - - auto return_code = sim->run(); - - //for (auto& mem : buffer_data) - //delete mem.second; - for (auto& plugin_device : plugin_devices) delete plugin_device.second; delete[] argv; - delete sim; +// delete sim; return return_code; }