CRISP Integration #77

Closed
wants to merge 43 commits into from

Commits (43)
7a3fd9a
l1d write ratio 25 -> 20
JRPan Oct 6, 2021
c52a29e
my jenkins now
JRPan Oct 13, 2021
72e9807
clean before make
JRPan Oct 13, 2021
db342c8
Create main.yml
JRPan Dec 10, 2021
f9195b9
add more job
JRPan Dec 10, 2021
67630c2
update yaml
JRPan Dec 10, 2021
6547cf6
Apply Github Action to all branches
JRPan Dec 11, 2021
58074c4
adding occupancy to aerialvision
JRPan Jun 28, 2022
314df10
add jetson orin config
JRPan Aug 4, 2022
f98089d
correct Orin cache configs based on whitepaper
JRPan Aug 5, 2022
708e053
update Orin shmem config
JRPan Aug 5, 2022
d2b3177
update Orin config
JRPan Aug 10, 2022
b57f345
update orin config
JRPan Sep 6, 2022
ad3558d
Merge branch 'dev' into mydev
JRPan Sep 6, 2022
90a710f
WIP multi-kernel stats, power model no usable, no PTX
JRPan Jan 16, 2023
c5566e1
WIP: bug fix at clear stats
JRPan Jan 19, 2023
9c25b4a
move increment_x_then_y_then_z
JRPan Jan 19, 2023
bc5789c
Merge branch 'mydev' into dev-multi-kernel-stats
JRPan Jan 19, 2023
4f10351
WIP: multi-kernel stats update ccyles, insts counts etc to per kernel
JRPan Jan 20, 2023
99a3187
multi-kernel stats
JRPan Jan 23, 2023
b64773e
limited concurrent + multi kernel stats update
JRPan Feb 6, 2023
20eadb4
New option for SM C/G
JRPan Mar 22, 2023
456bd3c
perfect l2, finegrained schduler, mig style, move TEX cache to L1D
JRPan Apr 4, 2023
e56ac3f
naive dynamic FG
JRPan Apr 12, 2023
95b05c9
dynamic concurent
JRPan Apr 26, 2023
97f6456
revert seperating tex and vertex in cache
JRPan May 10, 2023
3f94337
adding the best scheduler
JRPan Oct 25, 2023
2733d81
added warp slicer. Need to fix partition to check for limiting factor
JRPan Dec 11, 2023
fa69bd0
Allow TB to exceed partition
JRPan Mar 7, 2024
0c0ecdd
artifacts
JRPan Jul 23, 2024
376e1b9
Merge branch 'dev-vulkan' into crisp
JRPan Sep 25, 2024
5a0e24b
revert gitingore changes
JRPan Sep 26, 2024
1de0f48
merge clean
JRPan Sep 30, 2024
d8b6a10
unsigned long long stream id
JRPan Sep 30, 2024
320f72c
Automated Format
purdue-jenkins Oct 1, 2024
05fd91f
more cleanup
JRPan Oct 7, 2024
f08f1b6
factoring cache utility
JRPan Oct 12, 2024
485f0b0
sync, cleanup: compute runs
JRPan Oct 21, 2024
53cf9ad
remove is_grpahics
JRPan Oct 23, 2024
296ec52
Merge remote-tracking branch 'upstream/dev' into crisp
JRPan Oct 23, 2024
591f477
add cycle collect scope
JRPan Oct 24, 2024
5cc51ad
remove is_graphics_kernel from kernel_info
JRPan Oct 24, 2024
1c54d70
fix kernel name
JRPan Oct 28, 2024
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -10,6 +10,7 @@ project(GPGPU-Sim
# Specify the C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)
string(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")

# GPGPU-Sim build option
option(GPGPUSIM_ENABLE_TRACE "Whether to enable GPGPU-Sim debug tracing" ON)
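The new string(REPLACE ...) line strips -DNDEBUG from CMAKE_CXX_FLAGS_RELEASE, presumably to keep assert() checks compiled into optimized Release builds. A minimal standalone C++ sketch of the behaviour this preserves (illustrative only, not part of the diff):

// With -DNDEBUG removed from the Release flags, this assert is evaluated
// even in an optimized build; defining NDEBUG would compile it away.
#include <cassert>
#include <cstdio>

int main() {
  unsigned warp_size = 32;
  assert(warp_size == 32 && "invariant still checked in Release builds");
  std::printf("assertions are enabled in this build\n");
  return 0;
}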
5 changes: 5 additions & 0 deletions aerialvision/lexyacc.py
@@ -160,6 +160,7 @@ def t_error(t):
'L2WriteMiss':vc.variable('ltwowritemiss', 1, 0, 'scalar'),
'L2WriteHit' :vc.variable('ltwowritehit', 1, 0, 'scalar'),
'L2ReadHit' :vc.variable('ltworeadhit', 1, 0, 'scalar'),
'gpu_compute_issued':vc.variable('', 1, 0, 'scalar'),
'globalTotInsn':vc.variable('globaltotinsncount', 1,0, 'scalar'),
'dramCMD' :vc.variable('', 2, 0, 'idxVec'),
'dramNOP' :vc.variable('', 2, 0, 'idxVec'),
@@ -173,11 +174,15 @@ def t_error(t):
'globalCompletedThreads':vc.variable('gpucompletedthreads', 1, 1, 'scalar'),
'globalSentWrites':vc.variable('gpgpunsentwrites', 1, 0, 'scalar'),
'globalProcessedWrites':vc.variable('gpgpunprocessedwrites', 1, 0, 'scalar'),
'warpslotfilled':vc.variable('warpslotfilled', 1,0, 'scalar'),
'warptotalslot':vc.variable('warptotalslot', 1,0, 'scalar'),
'averagemflatency' :vc.variable('', 1, 0, 'custom'),
'LDmemlatdist':vc.variable('', 3, 0, 'stackbar'),
'STmemlatdist':vc.variable('', 3, 0, 'stackbar'),
'WarpDivergenceBreakdown':vc.variable('', 3, 0, 'stackbar'),
'unit_active':vc.variable('unit_active', 2, 0, 'impVec'),
'WarpIssueSlotBreakdown':vc.variable('', 3, 0, 'stackbar'),
'L2Breakdown':vc.variable('', 3, 0, 'stackbar'),
'WarpIssueDynamicIdBreakdown':vc.variable('', 3, 0, 'stackbar'),
'dram_writes_per_cycle':vc.variable('', 1, 0, 'scalar', float),
'dram_reads_per_cycle' :vc.variable('', 1, 0, 'scalar', float),
2 changes: 1 addition & 1 deletion configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
@@ -97,7 +97,7 @@
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_cache_write_ratio 20

# shared memory configuration
-gpgpu_shmem_size 65536
2 changes: 1 addition & 1 deletion configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -150,7 +150,7 @@
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_cache_write_ratio 20
-gpgpu_l1_latency 20
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
2 changes: 1 addition & 1 deletion configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
@@ -97,7 +97,7 @@
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_cache_write_ratio 20

# shared memory configuration
-gpgpu_shmem_size 102400
178 changes: 178 additions & 0 deletions configs/tested-cfgs/SM87_ORIN/gpgpusim.config
@@ -0,0 +1,178 @@
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 87

# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 5000
-gpgpu_TB_launch_latency 0

# Compute Capability
-gpgpu_compute_capability_major 8
-gpgpu_compute_capability_minor 7

# PTX execution-driven
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 16
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 16
-gpgpu_n_sub_partition_per_mchannel 2

# clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 1300:1300:1300:1300

# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 87

-gpgpu_shader_core_pipeline 1536:32
-gpgpu_shader_cta 32
-gpgpu_simd_model 1

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on SFU unit
-ptx_opcode_latency_int 4,4,4,4,21
-ptx_opcode_initiation_int 2,2,2,2,2
-ptx_opcode_latency_fp 4,4,4,4,39
-ptx_opcode_initiation_fp 1,1,1,1,2
-ptx_opcode_latency_dp 64,64,64,64,330
-ptx_opcode_initiation_dp 64,64,64,64,130
-ptx_opcode_latency_sfu 21
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 25
-ptx_opcode_initiation_tensor 25

# sub core model: in which each scheduler has its own register file and EUs
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# register banks
-gpgpu_num_reg_banks 8
-gpgpu_reg_file_port_throughput 2

# warp scheduling
-gpgpu_num_sched_per_core 4
-gpgpu_scheduler lrr
# a warp scheduler issue mode
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 0,8,16,32,64,100,132,164
-gpgpu_unified_l1d_size 192
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:384,L:T:m:L:L,A:384:48,16:0,32
-gpgpu_l1_latency 38
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 20

# shared memory configuration
-gpgpu_shmem_size 167936
-gpgpu_shmem_sizeDefault 167936
-gpgpu_shmem_per_block 49152
-gpgpu_smem_latency 29
# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 87

# L2 cache
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:X,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 2

# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note: TEX is deprecated since Volta; it is used for legacy apps only. Use the L1D cache instead with the .nc modifier or the __ldg method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1

# interconnection
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1

# memory partition latency config
-gpgpu_l2_rop_latency 146
-dram_latency 367

# dram sched config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192

# dram model config
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 2
-gpgpu_dram_burst_length 32
-dram_data_command_freq_ratio 2
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS

# Mem timing
-gpgpu_dram_timing_opt nbk=16:CCD=16:RRD=5:RCD=9:RAS=21:RP=9:RC=29:CL=9:WL=3:CDLR=4:WR=9:nbkgrp=4:CCDL=3:RTPL=2
-dram_dual_bus_interface 0

# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1

#-dram_seperate_write_queue_enable 1
#-dram_write_queue_size 64:56:32

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs, disabled until we create a real energy model
-power_simulation_enabled 0

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0
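The adaptive-cache comment above implies that the 192 KB of unified on-chip SRAM is carved between shared memory and L1D: the smallest -gpgpu_shmem_option value that covers a kernel's per-SM shared-memory demand is chosen and the remainder goes to L1D. A rough C++ sketch of that split, assuming the option values are in KB (the simulator's actual resizing logic may differ):

// Hedged illustration of the adaptive shared-memory/L1D split suggested by
// -gpgpu_adaptive_cache_config, -gpgpu_shmem_option, and
// -gpgpu_unified_l1d_size in the config above.
#include <cstdio>
#include <vector>

int main() {
  const unsigned unified_kb = 192;  // -gpgpu_unified_l1d_size
  const std::vector<unsigned> shmem_options_kb = {0, 8, 16, 32, 64, 100, 132, 164};
  const unsigned shmem_demand_kb = 96;  // hypothetical per-SM demand

  for (unsigned opt : shmem_options_kb) {
    if (opt >= shmem_demand_kb) {
      std::printf("shared memory carve-out: %u KB, L1D: %u KB\n", opt,
                  unified_kb - opt);
      break;  // e.g. 100 KB shared memory, 92 KB L1D for a 96 KB demand
    }
  }
  return 0;
}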
Empty file modified format-code.sh
100755 → 100644
Empty file.
2 changes: 1 addition & 1 deletion libcuda/cuda_runtime_api.cc
@@ -124,7 +124,7 @@

#define __CUDA_RUNTIME_API_H__
// clang-format off
#include "host_defines.h"
#include "cuda_runtime_api.h"
#include "builtin_types.h"
#include "driver_types.h"
#include "cuda_api.h"
Empty file modified run-clang-format.py
100755 → 100644
Empty file.
Empty file modified short-tests.sh
100755 → 100644
Empty file.
42 changes: 34 additions & 8 deletions src/abstract_hardware_model.cc
@@ -208,6 +208,12 @@ gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx)

gpu_sim_cycle = 0;
gpu_tot_sim_cycle = 0;
gpu_render_start_cycle = 0;
gpu_compute_start_cycle = -1;
gpu_last_frame_cycle = 0;
gpu_compute_end_cycle = 0;
gpu_last_compute_cycle = 0;
gpu_compute_issued = 0;
}

new_addr_type line_size_based_tag_func(new_addr_type address,
@@ -310,6 +316,9 @@ void warp_inst_t::generate_mem_accesses() {
break;
case global_space:
access_type = is_write ? GLOBAL_ACC_W : GLOBAL_ACC_R;
if (mem_op == TEX) {
access_type = TEXTURE_ACC_R;
}
break;
case local_space:
case param_space_local:
@@ -459,7 +468,17 @@ void warp_inst_t::generate_mem_accesses() {
line_size_based_tag_func(addr, cache_block_size);
accesses[block_address].set(thread);
unsigned idx = addr - block_address;
for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i);
for (unsigned i = 0; i < data_size; i++) {
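// Mark each requested byte; if the access runs past the end of this cache
// line, also record the next line's block address and carry the spill-over
// byte offset into that line.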
if (idx + i < cache_block_size) {
byte_mask.set(idx + i);
} else {
unsigned block_address = line_size_based_tag_func(
addr + cache_block_size, cache_block_size);
accesses[block_address].set(thread);
byte_mask.set(idx + i - cache_block_size);
break;
}
}
}
for (a = accesses.begin(); a != accesses.end(); ++a)
m_accessq.push_back(mem_access_t(
@@ -1234,17 +1253,12 @@ warp_inst_t core_t::getExecuteWarp(unsigned warpId) {
}

void core_t::deleteSIMTStack() {
if (m_simt_stack) {
for (unsigned i = 0; i < m_warp_count; ++i) delete m_simt_stack[i];
delete[] m_simt_stack;
m_simt_stack = NULL;
}
for (unsigned i = 0; i < m_simt_stack.size(); ++i) delete m_simt_stack[i];
}

void core_t::initilizeSIMTStack(unsigned warp_count, unsigned warp_size) {
m_simt_stack = new simt_stack *[warp_count];
for (unsigned i = 0; i < warp_count; ++i)
m_simt_stack[i] = new simt_stack(i, warp_size, m_gpu);
m_simt_stack.push_back(new simt_stack(i, warp_size, m_gpu));
m_warp_size = warp_size;
m_warp_count = warp_count;
}
@@ -1253,3 +1267,15 @@ void core_t::get_pdom_stack_top_info(unsigned warpId, unsigned *pc,
unsigned *rpc) const {
m_simt_stack[warpId]->get_pdom_stack_top_info(pc, rpc);
}

void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) {
i.x++;
if (i.x >= bound.x) {
i.x = 0;
i.y++;
if (i.y >= bound.y) {
i.y = 0;
if (i.z < bound.z) i.z++;
}
}
}
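A small usage sketch of the relocated increment_x_then_y_then_z helper, which advances a dim3 index in x-fastest, then y, then z order over a grid. The dim3 struct and the function body are reproduced from the diff only so the example compiles standalone:

// Visit every CTA index of a 2x2x2 grid in x, then y, then z order.
#include <cstdio>

struct dim3 { unsigned x, y, z; };

void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) {
  i.x++;
  if (i.x >= bound.x) {
    i.x = 0;
    i.y++;
    if (i.y >= bound.y) {
      i.y = 0;
      if (i.z < bound.z) i.z++;
    }
  }
}

int main() {
  const dim3 grid = {2, 2, 2};
  dim3 idx = {0, 0, 0};
  for (unsigned n = 0; n < grid.x * grid.y * grid.z; n++) {
    std::printf("CTA (%u,%u,%u)\n", idx.x, idx.y, idx.z);
    increment_x_then_y_then_z(idx, grid);
  }
  return 0;
}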