From 140228df3e714b49ce8cd071ac35ecbd423d19f0 Mon Sep 17 00:00:00 2001 From: christindbose Date: Wed, 23 Oct 2024 22:32:54 -0400 Subject: [PATCH 1/5] hopper initial --- .../SM90_H100/config_hopper_islip.icnt | 74 +++++++ configs/tested-cfgs/SM90_H100/gpgpusim.config | 180 ++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt create mode 100644 configs/tested-cfgs/SM90_H100/gpgpusim.config diff --git a/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt new file mode 100644 index 000000000..6775d5d6f --- /dev/null +++ b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 78; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; 
+write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config new file mode 100644 index 000000000..d26b1a621 --- /dev/null +++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config @@ -0,0 +1,180 @@ +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 86 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 +-gpgpu_max_concurrent_kernel 128 + +# Compute Capability +-gpgpu_compute_capability_major 8 +-gpgpu_compute_capability_minor 6 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 46 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 + +# clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1132:1132:1132:3500.5 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 86 + +-gpgpu_shader_core_pipeline 1536:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 
+-ptx_opcode_latency_fp 4,4,4,4,39 +-ptx_opcode_initiation_fp 1,1,1,1,2 +-ptx_opcode_latency_dp 64,64,64,64,330 +-ptx_opcode_initiation_dp 64,64,64,64,130 +-ptx_opcode_latency_sfu 21 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# sub core model: in which each scheduler has its own register file and EUs +# i.e. schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# register banks +-gpgpu_num_reg_banks 8 +-gpgpu_reg_file_port_throughput 2 + +# warp scheduling +-gpgpu_num_sched_per_core 4 +-gpgpu_scheduler lrr +# a warp scheduler issue mode +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# :::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32 +-gpgpu_l1_latency 39 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_cache_write_ratio 25 + +# shared memory configuration +-gpgpu_shmem_size 102400 +-gpgpu_shmem_sizeDefault 102400 +-gpgpu_shmem_per_block 49152 +-gpgpu_smem_latency 29 +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 86 + +# L2 cache +-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 
+-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 187 +-dram_latency 254 + +# dram sched config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# dram model config +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 2 +-gpgpu_dram_burst_length 16 +-dram_data_command_freq_ratio 4 +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS + +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 +-dram_dual_bus_interface 0 + +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# power model configs, disable it untill we create a real energy model +-power_simulation_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + From f2d08bbe8700f136cb007a0aa1e5ad80206bd538 Mon Sep 17 00:00:00 2001 From: christindbose Date: Thu, 24 Oct 2024 12:35:08 -0400 
Subject: [PATCH 2/5] num SMs/channels are in place --- .../SM90_H100/config_hopper_islip.icnt | 2 +- configs/tested-cfgs/SM90_H100/gpgpusim.config | 169 ++++++++++++------ 2 files changed, 113 insertions(+), 58 deletions(-) diff --git a/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt index 6775d5d6f..5ad7ecd48 100644 --- a/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt +++ b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt @@ -7,7 +7,7 @@ network_count = 2; // Topology topology = fly; -k = 78; +k = 144; n = 1; // Routing diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config index d26b1a621..02a099a11 100644 --- a/configs/tested-cfgs/SM90_H100/gpgpusim.config +++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config @@ -1,7 +1,45 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the H100 SXM5 +# For more info about hopper architecture: +#https://www.advancedclustering.com/wp-content/uploads/2022/03/gtc22-whitepaper-hopper.pdf + +# base config were taken from v100 and modified +# Items modified +# Num SMs, mem channels (HBM3 datawidth per channel) +# # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 --gpgpu_ptx_force_max_capability 86 +-gpgpu_ptx_force_max_capability 70 # Device Limits -gpgpu_stack_size_limit 1024 @@ -13,34 +51,40 @@ -gpgpu_max_concurrent_kernel 128 # Compute Capability --gpgpu_compute_capability_major 8 --gpgpu_compute_capability_minor 6 +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 # PTX execution-driven -gpgpu_ptx_convert_to_ptxplus 0 -gpgpu_ptx_save_converted_ptxplus 0 # high level architecture configuration --gpgpu_n_clusters 46 +-gpgpu_n_clusters 132 -gpgpu_n_cores_per_cluster 1 --gpgpu_n_mem 16 --gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_n_mem 40 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 -# clock domains +# volta clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1132:1132:1132:3500.5 +-gpgpu_clock_domains 
1132.0:1132.0:1132.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 # shader core pipeline config -gpgpu_shader_registers 65536 -gpgpu_registers_per_block 65536 --gpgpu_occupancy_sm_number 86 +-gpgpu_occupancy_sm_number 70 --gpgpu_shader_core_pipeline 1536:32 +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 -gpgpu_shader_cta 32 -gpgpu_simd_model 1 # Pipeline widths and number of FUs # ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units -gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 -gpgpu_num_sp_units 4 -gpgpu_num_sfu_units 4 @@ -52,18 +96,18 @@ # Instruction latencies and initiation intervals # "ADD,MAX,MUL,MAD,DIV" # All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,4,4,4,21 --ptx_opcode_initiation_int 2,2,2,2,2 --ptx_opcode_latency_fp 4,4,4,4,39 --ptx_opcode_initiation_fp 1,1,1,1,2 --ptx_opcode_latency_dp 64,64,64,64,330 --ptx_opcode_initiation_dp 64,64,64,64,130 --ptx_opcode_latency_sfu 21 +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 -ptx_opcode_initiation_sfu 8 -ptx_opcode_latency_tesnor 64 -ptx_opcode_initiation_tensor 64 -# sub core model: in which each scheduler has its own register file and EUs +# Volta has sub core model, in which each scheduler has its own register file and EUs # i.e. 
schedulers are isolated -gpgpu_sub_core_model 1 # disable specialized operand collectors and use generic operand collectors instead @@ -71,47 +115,52 @@ -gpgpu_operand_collector_num_units_gen 8 -gpgpu_operand_collector_num_in_ports_gen 8 -gpgpu_operand_collector_num_out_ports_gen 8 -# register banks --gpgpu_num_reg_banks 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Register File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 -gpgpu_reg_file_port_throughput 2 -# warp scheduling +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core -gpgpu_num_sched_per_core 4 +# Loose round-robin (lrr) scheduler -gpgpu_scheduler lrr -# a warp scheduler issue mode +## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration # :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 --gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_shmem_option 0,8,16,32,64,96 -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32 --gpgpu_l1_latency 39 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 -gpgpu_flush_l1_cache 1 
-gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_cache_write_ratio 25 - -# shared memory configuration --gpgpu_shmem_size 102400 --gpgpu_shmem_sizeDefault 102400 --gpgpu_shmem_per_block 49152 --gpgpu_smem_latency 29 -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 86 - -# L2 cache --gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 1 @@ -121,13 +170,15 @@ -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 -gpgpu_inst_fetch_throughput 4 # 128 KB Tex -# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg method -gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 # 64 KB Const -gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 -gpgpu_perfect_inst_const_cache 1 # interconnection +#-network_mode 1 +#-inter_config_file config_hopper_islip.icnt # use built-in local xbar -network_mode 2 -icnt_in_buffer_limit 512 @@ -137,26 +188,34 @@ -icnt_arbiter_algo 1 # memory partition latency config --gpgpu_l2_rop_latency 187 --dram_latency 254 +-gpgpu_l2_rop_latency 160 +-dram_latency 100 -# dram sched config +# dram model config -gpgpu_dram_scheduler 1 -gpgpu_frfcfs_dram_sched_queue_size 64 -gpgpu_dram_return_queue_size 192 -# dram model config +# for HBM3, 5 stacks, 40 channels, each (64 bits) 8 bytes width -gpgpu_n_mem_per_ctrlr 1 --gpgpu_dram_buswidth 2 --gpgpu_dram_burst_length 16 --dram_data_command_freq_ratio 4 +-gpgpu_dram_buswidth 8 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR -gpgpu_mem_address_mask 1 --gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timings are adopted from the hynix JESD235 standard and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" -# Mem timing --gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 --dram_dual_bus_interface 0 +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" +# HBM has dual bus interface, in which it can issue two col and row commands at a time +-dram_dual_bus_interface 1 # select lower bits for bnkgrp 
to increase bnkgrp parallelism -dram_bnk_indexing_policy 0 -dram_bnkgrp_indexing_policy 1 @@ -170,11 +229,7 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD #-trace_sampling_core 0 - From d075d471cdcc5407151c6dc982ccd71ea3fcd640 Mon Sep 17 00:00:00 2001 From: christindbose Date: Thu, 24 Oct 2024 16:05:43 -0400 Subject: [PATCH 3/5] L1D and L2 sizes updated --- configs/tested-cfgs/SM90_H100/gpgpusim.config | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config index 02a099a11..947255b20 100644 --- a/configs/tested-cfgs/SM90_H100/gpgpusim.config +++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config @@ -61,7 +61,7 @@ # high level architecture configuration -gpgpu_n_clusters 132 -gpgpu_n_cores_per_cluster 1 --gpgpu_n_mem 40 +-gpgpu_n_mem 80 -gpgpu_n_sub_partition_per_mchannel 2 -gpgpu_clock_gated_lanes 1 @@ -137,16 +137,16 @@ ## L1/shared memory configuration # :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -# Defualt config is 32KB DL1 and 96KB shared memory +# Defualt config is 160KB DL1 and 96KB shared memory # In Volta, we assign the remaining shared memory to L1 cache -# if the assigned shd mem = 0, then L1 cache = 128KB +# # if the assigned shd mem = 0, then L1 cache = 256KB # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 -gpgpu_shmem_option 0,8,16,32,64,96 -gpgpu_unified_l1d_size 128 # L1 cache configuration --gpgpu_l1_banks 4 +-gpgpu_l1_banks 8 -gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 @@ -159,8 +159,8 @@ 
-gpgpu_shmem_per_block 65536 -gpgpu_smem_latency 20 -# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache --gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +# 64 sets, each 128 bytes 40-way for each memory sub partition (320 KB per memory sub partition). This gives us 50MB L2 cache +-gpgpu_cache:dl2 S:64:128:40,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 1 @@ -196,7 +196,7 @@ -gpgpu_frfcfs_dram_sched_queue_size 64 -gpgpu_dram_return_queue_size 192 -# for HBM3, 5 stacks, 40 channels, each (64 bits) 8 bytes width +# for HBM3, 5 stacks, 80 channels, each (64 bits) 8 bytes width -gpgpu_n_mem_per_ctrlr 1 -gpgpu_dram_buswidth 8 -gpgpu_dram_burst_length 2 From fa235ebcf932f82391bf8ccc9a0f69295fab6825 Mon Sep 17 00:00:00 2001 From: christindbose Date: Thu, 24 Oct 2024 16:22:11 -0400 Subject: [PATCH 4/5] small update to documentation for l1d --- configs/tested-cfgs/SM90_H100/gpgpusim.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config index 947255b20..1961449e8 100644 --- a/configs/tested-cfgs/SM90_H100/gpgpusim.config +++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config @@ -35,7 +35,8 @@ # base config were taken from v100 and modified # Items modified # Num SMs, mem channels (HBM3 datawidth per channel) -# +# L1D, L2 size +# # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 @@ -138,7 +139,7 @@ # :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # Defualt config is 160KB DL1 and 96KB shared memory -# In Volta, we assign the remaining shared memory to L1 cache +# In Hopper, we assign the remaining shared memory to L1 cache # # if the assigned shd mem = 0, then L1 cache = 256KB # For more info, see 
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution From 4fd1ae00c6752f384300c6fad5baa2b4c5937462 Mon Sep 17 00:00:00 2001 From: christindbose Date: Fri, 3 Jan 2025 03:34:00 -0500 Subject: [PATCH 5/5] updated cache configs --- configs/tested-cfgs/SM90_H100/gpgpusim.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config index 1961449e8..c738a56b5 100644 --- a/configs/tested-cfgs/SM90_H100/gpgpusim.config +++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config @@ -165,7 +165,7 @@ -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 1 --gpgpu_memory_partition_indexing 2 +-gpgpu_memory_partition_indexing 0 # 128 KB Inst. -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4