CRISP Integration #77

Closed
wants to merge 43 commits into from

Commits (43)
7a3fd9a
l1d write ratio 25 -> 20
JRPan Oct 6, 2021
c52a29e
my jenkins now
JRPan Oct 13, 2021
72e9807
clean before make
JRPan Oct 13, 2021
db342c8
Create main.yml
JRPan Dec 10, 2021
f9195b9
add more job
JRPan Dec 10, 2021
67630c2
update yaml
JRPan Dec 10, 2021
6547cf6
Apply Github Action to all branches
JRPan Dec 11, 2021
58074c4
adding occupancy to aerialvision
JRPan Jun 28, 2022
314df10
add jetson orin config
JRPan Aug 4, 2022
f98089d
correct Orin cache configs based on whitepaper
JRPan Aug 5, 2022
708e053
update Orin shmem config
JRPan Aug 5, 2022
d2b3177
update Orin config
JRPan Aug 10, 2022
b57f345
update orin config
JRPan Sep 6, 2022
ad3558d
Merge branch 'dev' into mydev
JRPan Sep 6, 2022
90a710f
WIP multi-kernel stats, power model no usable, no PTX
JRPan Jan 16, 2023
c5566e1
WIP: bug fix at clear stats
JRPan Jan 19, 2023
9c25b4a
move increment_x_then_y_then_z
JRPan Jan 19, 2023
bc5789c
Merge branch 'mydev' into dev-multi-kernel-stats
JRPan Jan 19, 2023
4f10351
WIP: multi-kernel stats update ccyles, insts counts etc to per kernel
JRPan Jan 20, 2023
99a3187
multi-kernel stats
JRPan Jan 23, 2023
b64773e
limited concurrent + multi kernel stats update
JRPan Feb 6, 2023
20eadb4
New option for SM C/G
JRPan Mar 22, 2023
456bd3c
perfect l2, finegrained schduler, mig style, move TEX cache to L1D
JRPan Apr 4, 2023
e56ac3f
naive dynamic FG
JRPan Apr 12, 2023
95b05c9
dynamic concurent
JRPan Apr 26, 2023
97f6456
revert seperating tex and vertex in cache
JRPan May 10, 2023
3f94337
adding the best scheduler
JRPan Oct 25, 2023
2733d81
added warp slicer. Need to fix partition to check for limiting factor
JRPan Dec 11, 2023
fa69bd0
Allow TB to exceed partition
JRPan Mar 7, 2024
0c0ecdd
artifacts
JRPan Jul 23, 2024
376e1b9
Merge branch 'dev-vulkan' into crisp
JRPan Sep 25, 2024
5a0e24b
revert gitingore changes
JRPan Sep 26, 2024
1de0f48
merge clean
JRPan Sep 30, 2024
d8b6a10
unsigned long long stream id
JRPan Sep 30, 2024
320f72c
Automated Format
purdue-jenkins Oct 1, 2024
05fd91f
more cleanup
JRPan Oct 7, 2024
f08f1b6
factoring cache utility
JRPan Oct 12, 2024
485f0b0
sync, cleanup: compute runs
JRPan Oct 21, 2024
53cf9ad
remove is_grpahics
JRPan Oct 23, 2024
296ec52
Merge remote-tracking branch 'upstream/dev' into crisp
JRPan Oct 23, 2024
591f477
add cycle collect scope
JRPan Oct 24, 2024
5cc51ad
remove is_graphics_kernel from kernel_info
JRPan Oct 24, 2024
1c54d70
fix kernel name
JRPan Oct 28, 2024
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -10,6 +10,7 @@ project(GPGPU-Sim
# Specify the C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)
string(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")

# GPGPU-Sim build option
option(GPGPUSIM_ENABLE_TRACE "Whether to enable GPGPU-Sim debug tracing" ON)
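The new string(REPLACE ...) line strips -DNDEBUG from CMAKE_CXX_FLAGS_RELEASE, presumably to keep assert() checks compiled into optimized Release builds. A minimal standalone C++ sketch of the behaviour this preserves (illustrative only, not part of the diff):

// With -DNDEBUG removed from the Release flags, this assert is evaluated
// even in an optimized build; defining NDEBUG would compile it away.
#include <cassert>
#include <cstdio>

int main() {
  unsigned warp_size = 32;
  assert(warp_size == 32 && "invariant still checked in Release builds");
  std::printf("assertions are enabled in this build\n");
  return 0;
}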
5 changes: 5 additions & 0 deletions aerialvision/lexyacc.py
@@ -160,6 +160,7 @@ def t_error(t):
'L2WriteMiss':vc.variable('ltwowritemiss', 1, 0, 'scalar'),
'L2WriteHit' :vc.variable('ltwowritehit', 1, 0, 'scalar'),
'L2ReadHit' :vc.variable('ltworeadhit', 1, 0, 'scalar'),
'gpu_compute_issued':vc.variable('', 1, 0, 'scalar'),
'globalTotInsn':vc.variable('globaltotinsncount', 1,0, 'scalar'),
'dramCMD' :vc.variable('', 2, 0, 'idxVec'),
'dramNOP' :vc.variable('', 2, 0, 'idxVec'),
@@ -173,11 +174,15 @@ def t_error(t):
'globalCompletedThreads':vc.variable('gpucompletedthreads', 1, 1, 'scalar'),
'globalSentWrites':vc.variable('gpgpunsentwrites', 1, 0, 'scalar'),
'globalProcessedWrites':vc.variable('gpgpunprocessedwrites', 1, 0, 'scalar'),
'warpslotfilled':vc.variable('warpslotfilled', 1,0, 'scalar'),
'warptotalslot':vc.variable('warptotalslot', 1,0, 'scalar'),
'averagemflatency' :vc.variable('', 1, 0, 'custom'),
'LDmemlatdist':vc.variable('', 3, 0, 'stackbar'),
'STmemlatdist':vc.variable('', 3, 0, 'stackbar'),
'WarpDivergenceBreakdown':vc.variable('', 3, 0, 'stackbar'),
'unit_active':vc.variable('unit_active', 2, 0, 'impVec'),
'WarpIssueSlotBreakdown':vc.variable('', 3, 0, 'stackbar'),
'L2Breakdown':vc.variable('', 3, 0, 'stackbar'),
'WarpIssueDynamicIdBreakdown':vc.variable('', 3, 0, 'stackbar'),
'dram_writes_per_cycle':vc.variable('', 1, 0, 'scalar', float),
'dram_reads_per_cycle' :vc.variable('', 1, 0, 'scalar', float),
2 changes: 1 addition & 1 deletion configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
@@ -97,7 +97,7 @@
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_cache_write_ratio 20

# shared memory configuration
-gpgpu_shmem_size 65536
2 changes: 1 addition & 1 deletion configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -150,7 +150,7 @@
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_cache_write_ratio 20
-gpgpu_l1_latency 20
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
2 changes: 1 addition & 1 deletion configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
@@ -97,7 +97,7 @@
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_cache_write_ratio 20

# shared memory configuration
-gpgpu_shmem_size 102400
178 changes: 178 additions & 0 deletions configs/tested-cfgs/SM87_ORIN/gpgpusim.config
@@ -0,0 +1,178 @@
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 87

# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 5000
-gpgpu_TB_launch_latency 0

# Compute Capability
-gpgpu_compute_capability_major 8
-gpgpu_compute_capability_minor 7

# PTX execution-driven
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 16
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 16
-gpgpu_n_sub_partition_per_mchannel 2

# clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 1300:1300:1300:1300

# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 87

-gpgpu_shader_core_pipeline 1536:32
-gpgpu_shader_cta 32
-gpgpu_simd_model 1

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on SFU unit
-ptx_opcode_latency_int 4,4,4,4,21
-ptx_opcode_initiation_int 2,2,2,2,2
-ptx_opcode_latency_fp 4,4,4,4,39
-ptx_opcode_initiation_fp 1,1,1,1,2
-ptx_opcode_latency_dp 64,64,64,64,330
-ptx_opcode_initiation_dp 64,64,64,64,130
-ptx_opcode_latency_sfu 21
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 25
-ptx_opcode_initiation_tensor 25

# sub core model: in which each scheduler has its own register file and EUs
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# register banks
-gpgpu_num_reg_banks 8
-gpgpu_reg_file_port_throughput 2

# warp scheduling
-gpgpu_num_sched_per_core 4
-gpgpu_scheduler lrr
# a warp scheduler issue mode
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 0,8,16,32,64,100,132,164
-gpgpu_unified_l1d_size 192
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:384,L:T:m:L:L,A:384:48,16:0,32
-gpgpu_l1_latency 38
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 20

# shared memory configuration
-gpgpu_shmem_size 167936
-gpgpu_shmem_sizeDefault 167936
-gpgpu_shmem_per_block 49152
-gpgpu_smem_latency 29
# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 87

# L2 cache
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:X,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 2

# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note: TEX is deprecated since Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1

# interconnection
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1

# memory partition latency config
-gpgpu_l2_rop_latency 146
-dram_latency 367

# dram sched config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192

# dram model config
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 2
-gpgpu_dram_burst_length 32
-dram_data_command_freq_ratio 2
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS

# Mem timing
-gpgpu_dram_timing_opt nbk=16:CCD=16:RRD=5:RCD=9:RAS=21:RP=9:RC=29:CL=9:WL=3:CDLR=4:WR=9:nbkgrp=4:CCDL=3:RTPL=2
-dram_dual_bus_interface 0

# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1

#-dram_seperate_write_queue_enable 1
#-dram_write_queue_size 64:56:32

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs, disabled until we create a real energy model
-power_simulation_enabled 0

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0
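The adaptive-cache comment above implies that the 192 KB of unified on-chip SRAM is carved between shared memory and L1D: the smallest -gpgpu_shmem_option value that covers a kernel's per-SM shared-memory demand is chosen and the remainder goes to L1D. A rough C++ sketch of that split, assuming the option values are in KB (the simulator's actual resizing logic may differ):

// Hedged illustration of the adaptive shared-memory/L1D split suggested by
// -gpgpu_adaptive_cache_config, -gpgpu_shmem_option, and
// -gpgpu_unified_l1d_size in the config above.
#include <cstdio>
#include <vector>

int main() {
  const unsigned unified_kb = 192;  // -gpgpu_unified_l1d_size
  const std::vector<unsigned> shmem_options_kb = {0, 8, 16, 32, 64, 100, 132, 164};
  const unsigned shmem_demand_kb = 96;  // hypothetical per-SM demand

  for (unsigned opt : shmem_options_kb) {
    if (opt >= shmem_demand_kb) {
      std::printf("shared memory carve-out: %u KB, L1D: %u KB\n", opt,
                  unified_kb - opt);
      break;  // e.g. 100 KB shared memory, 92 KB L1D for a 96 KB demand
    }
  }
  return 0;
}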
Empty file modified format-code.sh
100755 → 100644
Empty file.
2 changes: 1 addition & 1 deletion libcuda/cuda_runtime_api.cc
@@ -124,7 +124,7 @@

#define __CUDA_RUNTIME_API_H__
// clang-format off
#include "host_defines.h"
#include "cuda_runtime_api.h"
#include "builtin_types.h"
#include "driver_types.h"
#include "cuda_api.h"
Empty file modified run-clang-format.py
100755 → 100644
Empty file.
Empty file modified short-tests.sh
100755 → 100644
Empty file.
42 changes: 34 additions & 8 deletions src/abstract_hardware_model.cc
@@ -208,6 +208,12 @@ gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx)

gpu_sim_cycle = 0;
gpu_tot_sim_cycle = 0;
gpu_render_start_cycle = 0;
gpu_compute_start_cycle = -1;
gpu_last_frame_cycle = 0;
gpu_compute_end_cycle = 0;
gpu_last_compute_cycle = 0;
gpu_compute_issued = 0;
}

new_addr_type line_size_based_tag_func(new_addr_type address,
@@ -310,6 +316,9 @@ void warp_inst_t::generate_mem_accesses() {
break;
case global_space:
access_type = is_write ? GLOBAL_ACC_W : GLOBAL_ACC_R;
if (mem_op == TEX) {
access_type = TEXTURE_ACC_R;
}
break;
case local_space:
case param_space_local:
@@ -459,7 +468,17 @@ void warp_inst_t::generate_mem_accesses() {
line_size_based_tag_func(addr, cache_block_size);
accesses[block_address].set(thread);
unsigned idx = addr - block_address;
for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i);
for (unsigned i = 0; i < data_size; i++) {
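// Mark each requested byte; if the access runs past the end of this cache
// line, also record the next line's block address and carry the spill-over
// byte offset into that line.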
if (idx + i < cache_block_size) {
byte_mask.set(idx + i);
} else {
unsigned block_address = line_size_based_tag_func(
addr + cache_block_size, cache_block_size);
accesses[block_address].set(thread);
byte_mask.set(idx + i - cache_block_size);
break;
}
}
}
for (a = accesses.begin(); a != accesses.end(); ++a)
m_accessq.push_back(mem_access_t(
@@ -1234,17 +1253,12 @@ warp_inst_t core_t::getExecuteWarp(unsigned warpId) {
}

void core_t::deleteSIMTStack() {
if (m_simt_stack) {
for (unsigned i = 0; i < m_warp_count; ++i) delete m_simt_stack[i];
delete[] m_simt_stack;
m_simt_stack = NULL;
}
for (unsigned i = 0; i < m_simt_stack.size(); ++i) delete m_simt_stack[i];
}

void core_t::initilizeSIMTStack(unsigned warp_count, unsigned warp_size) {
m_simt_stack = new simt_stack *[warp_count];
for (unsigned i = 0; i < warp_count; ++i)
m_simt_stack[i] = new simt_stack(i, warp_size, m_gpu);
m_simt_stack.push_back(new simt_stack(i, warp_size, m_gpu));
m_warp_size = warp_size;
m_warp_count = warp_count;
}
@@ -1253,3 +1267,15 @@ void core_t::get_pdom_stack_top_info(unsigned warpId, unsigned *pc,
unsigned *rpc) const {
m_simt_stack[warpId]->get_pdom_stack_top_info(pc, rpc);
}

void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) {
i.x++;
if (i.x >= bound.x) {
i.x = 0;
i.y++;
if (i.y >= bound.y) {
i.y = 0;
if (i.z < bound.z) i.z++;
}
}
}
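A small usage sketch of the relocated increment_x_then_y_then_z helper, which advances a dim3 index in x-fastest, then y, then z order over a grid. The dim3 struct and the function body are reproduced from the diff only so the example compiles standalone:

// Visit every CTA index of a 2x2x2 grid in x, then y, then z order.
#include <cstdio>

struct dim3 { unsigned x, y, z; };

void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) {
  i.x++;
  if (i.x >= bound.x) {
    i.x = 0;
    i.y++;
    if (i.y >= bound.y) {
      i.y = 0;
      if (i.z < bound.z) i.z++;
    }
  }
}

int main() {
  const dim3 grid = {2, 2, 2};
  dim3 idx = {0, 0, 0};
  for (unsigned n = 0; n < grid.x * grid.y * grid.z; n++) {
    std::printf("CTA (%u,%u,%u)\n", idx.x, idx.y, idx.z);
    increment_x_then_y_then_z(idx, grid);
  }
  return 0;
}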