From 140228df3e714b49ce8cd071ac35ecbd423d19f0 Mon Sep 17 00:00:00 2001
From: christindbose <chris241@purdue.edu>
Date: Wed, 23 Oct 2024 22:32:54 -0400
Subject: [PATCH 1/5] hopper initial

---
 .../SM90_H100/config_hopper_islip.icnt        |  74 +++++++
 configs/tested-cfgs/SM90_H100/gpgpusim.config | 180 ++++++++++++++++++
 2 files changed, 254 insertions(+)
 create mode 100644 configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt
 create mode 100644 configs/tested-cfgs/SM90_H100/gpgpusim.config
diff --git a/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt
new file mode 100644
index 000000000..6775d5d6f
--- /dev/null
+++ b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40; 
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 78;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs     = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters  = 1;
+
+credit_delay   = 0;
+routing_delay  = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup     = 1;
+output_speedup    = 1;
+internal_speedup  = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic                = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type       = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Read and write traffic classes are always used, regardless of the following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config
new file mode 100644
index 000000000..d26b1a621
--- /dev/null
+++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config
@@ -0,0 +1,180 @@
+# functional simulator specification
+-gpgpu_ptx_instruction_classification 0
+-gpgpu_ptx_sim_mode 0
+-gpgpu_ptx_force_max_capability 86
+
+# Device Limits
+-gpgpu_stack_size_limit 1024
+-gpgpu_heap_size_limit 8388608
+-gpgpu_runtime_sync_depth_limit 2
+-gpgpu_runtime_pending_launch_count_limit 2048
+-gpgpu_kernel_launch_latency 5000
+-gpgpu_TB_launch_latency 0
+-gpgpu_max_concurrent_kernel 128
+
+# Compute Capability
+-gpgpu_compute_capability_major 8
+-gpgpu_compute_capability_minor 6
+
+# PTX execution-driven
+-gpgpu_ptx_convert_to_ptxplus 0
+-gpgpu_ptx_save_converted_ptxplus 0
+
+# high level architecture configuration
+-gpgpu_n_clusters 46
+-gpgpu_n_cores_per_cluster 1
+-gpgpu_n_mem 16
+-gpgpu_n_sub_partition_per_mchannel 2
+
+# clock domains
+#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
+-gpgpu_clock_domains 1132:1132:1132:3500.5
+
+# shader core pipeline config
+-gpgpu_shader_registers 65536
+-gpgpu_registers_per_block 65536
+-gpgpu_occupancy_sm_number 86
+
+-gpgpu_shader_core_pipeline 1536:32
+-gpgpu_shader_cta 32
+-gpgpu_simd_model 1
+
+# Pipeline widths and number of FUs
+# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
+-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
+-gpgpu_num_sp_units 4
+-gpgpu_num_sfu_units 4
+-gpgpu_num_dp_units 4
+-gpgpu_num_int_units 4
+-gpgpu_tensor_core_avail 1
+-gpgpu_num_tensor_core_units 4
+
+# Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on SFU unit
+-ptx_opcode_latency_int 4,4,4,4,21
+-ptx_opcode_initiation_int 2,2,2,2,2
+-ptx_opcode_latency_fp 4,4,4,4,39
+-ptx_opcode_initiation_fp 1,1,1,1,2
+-ptx_opcode_latency_dp 64,64,64,64,330
+-ptx_opcode_initiation_dp 64,64,64,64,130
+-ptx_opcode_latency_sfu 21
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# sub core model: in which each scheduler has its own register file and EUs
+# i.e. schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# register banks
+-gpgpu_num_reg_banks 8
+-gpgpu_reg_file_port_throughput 2
+
+# warp scheduling
+-gpgpu_num_sched_per_core 4
+-gpgpu_scheduler lrr
+# a warp scheduler issue mode
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache 
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
+-gpgpu_adaptive_cache_config 1
+-gpgpu_shmem_option 0,8,16,32,64,100
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
+-gpgpu_l1_banks 4
+-gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32
+-gpgpu_l1_latency 39
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+-gpgpu_l1_cache_write_ratio 25
+
+# shared memory  configuration
+-gpgpu_shmem_size 102400
+-gpgpu_shmem_sizeDefault 102400
+-gpgpu_shmem_per_block 49152
+-gpgpu_smem_latency 29
+# shared memory bankconflict detection 
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 86
+
+# L2 cache
+-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 1
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config 
+-gpgpu_l2_rop_latency 187
+-dram_latency 254
+
+# dram sched config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# dram model config
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 2
+-gpgpu_dram_burst_length 16
+-dram_data_command_freq_ratio 4
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
+
+# Mem timing 
+-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4
+-dram_dual_bus_interface 0
+
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14 
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# power model configs, disable it untill we create a real energy model
+-power_simulation_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD
+#-trace_sampling_core 0
+

From f2d08bbe8700f136cb007a0aa1e5ad80206bd538 Mon Sep 17 00:00:00 2001
From: christindbose <chris241@purdue.edu>
Date: Thu, 24 Oct 2024 12:35:08 -0400
Subject: [PATCH 2/5] num SMs/channels are in place

---
 .../SM90_H100/config_hopper_islip.icnt        |   2 +-
 configs/tested-cfgs/SM90_H100/gpgpusim.config | 169 ++++++++++++------
 2 files changed, 113 insertions(+), 58 deletions(-)

diff --git a/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt
index 6775d5d6f..5ad7ecd48 100644
--- a/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt
+++ b/configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt
@@ -7,7 +7,7 @@ network_count = 2;
 
 // Topology
 topology = fly;
-k = 78;
+k = 144;
 n = 1;
 
 // Routing
diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config
index d26b1a621..02a099a11 100644
--- a/configs/tested-cfgs/SM90_H100/gpgpusim.config
+++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config
@@ -1,7 +1,45 @@
+# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
+# Northwestern University, Purdue University, The University of British Columbia
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer;
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution;
+# 3. Neither the names of Northwestern University, Purdue University,
+#    The University of British Columbia nor the names of their contributors
+#    may be used to endorse or promote products derived from this software
+#    without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+
+# This config models the H100 SMX5
+# For more info about hopper architecture:
+#https://www.advancedclustering.com/wp-content/uploads/2022/03/gtc22-whitepaper-hopper.pdf
+
+# base config were taken from v100 and modified
+# Items modified
+# Num SMs, mem channels (HBM3 datawidth per channel)
+#
 # functional simulator specification
 -gpgpu_ptx_instruction_classification 0
 -gpgpu_ptx_sim_mode 0
--gpgpu_ptx_force_max_capability 86
+-gpgpu_ptx_force_max_capability 70 
 
 # Device Limits
 -gpgpu_stack_size_limit 1024
@@ -13,34 +51,40 @@
 -gpgpu_max_concurrent_kernel 128
 
 # Compute Capability
--gpgpu_compute_capability_major 8
--gpgpu_compute_capability_minor 6
+-gpgpu_compute_capability_major 7
+-gpgpu_compute_capability_minor 0
 
 # PTX execution-driven
 -gpgpu_ptx_convert_to_ptxplus 0
 -gpgpu_ptx_save_converted_ptxplus 0
 
 # high level architecture configuration
--gpgpu_n_clusters 46
+-gpgpu_n_clusters 132	 
 -gpgpu_n_cores_per_cluster 1
--gpgpu_n_mem 16
--gpgpu_n_sub_partition_per_mchannel 2
+-gpgpu_n_mem 40
+-gpgpu_n_sub_partition_per_mchannel 2 
+-gpgpu_clock_gated_lanes 1
 
-# clock domains
+# volta clock domains
 #-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
--gpgpu_clock_domains 1132:1132:1132:3500.5
+-gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0
+# boost mode
+# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0
 
 # shader core pipeline config
 -gpgpu_shader_registers 65536
 -gpgpu_registers_per_block 65536
--gpgpu_occupancy_sm_number 86
+-gpgpu_occupancy_sm_number 70
 
--gpgpu_shader_core_pipeline 1536:32
+# This implies a maximum of 64 warps/SM
+-gpgpu_shader_core_pipeline 2048:32 
 -gpgpu_shader_cta 32
 -gpgpu_simd_model 1
 
 # Pipeline widths and number of FUs
 # ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
+## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units
+## we need to scale the number of pipeline registers to be equal to the number of SP units
 -gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
 -gpgpu_num_sp_units 4
 -gpgpu_num_sfu_units 4
@@ -52,18 +96,18 @@
 # Instruction latencies and initiation intervals
 # "ADD,MAX,MUL,MAD,DIV"
 # All Div operations are executed on SFU unit
--ptx_opcode_latency_int 4,4,4,4,21
--ptx_opcode_initiation_int 2,2,2,2,2
--ptx_opcode_latency_fp 4,4,4,4,39
--ptx_opcode_initiation_fp 1,1,1,1,2
--ptx_opcode_latency_dp 64,64,64,64,330
--ptx_opcode_initiation_dp 64,64,64,64,130
--ptx_opcode_latency_sfu 21
+-ptx_opcode_latency_int 4,13,4,5,145,21
+-ptx_opcode_initiation_int 2,2,2,2,8,4
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
 -ptx_opcode_initiation_sfu 8
 -ptx_opcode_latency_tesnor 64
 -ptx_opcode_initiation_tensor 64
 
-# sub core model: in which each scheduler has its own register file and EUs
+# Volta has sub core model, in which each scheduler has its own register file and EUs
 # i.e. schedulers are isolated
 -gpgpu_sub_core_model 1
 # disable specialized operand collectors and use generic operand collectors instead
@@ -71,47 +115,52 @@
 -gpgpu_operand_collector_num_units_gen 8
 -gpgpu_operand_collector_num_in_ports_gen 8
 -gpgpu_operand_collector_num_out_ports_gen 8
-# register banks
--gpgpu_num_reg_banks 8
+# volta has 8 banks, 4 schedulers, two banks per scheduler
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
 -gpgpu_reg_file_port_throughput 2
 
-# warp scheduling
+# shared memory bankconflict detection 
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 70
+
+# Volta has four schedulers per core
 -gpgpu_num_sched_per_core 4
+# Loose round-robin (lrr) scheduler
 -gpgpu_scheduler lrr
-# a warp scheduler issue mode
+## In Volta, a warp scheduler can issue 1 inst per cycle
 -gpgpu_max_insn_issue_per_warp 1
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
 # <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
 # ** Optional parameter - Required when mshr_type==Texture Fifo
-# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache 
+# Defualt config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache 
+# if the assigned shd mem = 0, then L1 cache = 128KB
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
+# disable this mode in case of multi kernels/apps execution
 -gpgpu_adaptive_cache_config 1
--gpgpu_shmem_option 0,8,16,32,64,100
+-gpgpu_shmem_option 0,8,16,32,64,96
 -gpgpu_unified_l1d_size 128
 # L1 cache configuration
 -gpgpu_l1_banks 4
--gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32
--gpgpu_l1_latency 39
+-gpgpu_cache:dl1  S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
 -gpgpu_gmem_skip_L1D 0
 -gpgpu_flush_l1_cache 1
 -gpgpu_n_cluster_ejection_buffer_size 32
--gpgpu_l1_cache_write_ratio 25
-
-# shared memory  configuration
--gpgpu_shmem_size 102400
--gpgpu_shmem_sizeDefault 102400
--gpgpu_shmem_per_block 49152
--gpgpu_smem_latency 29
-# shared memory bankconflict detection 
--gpgpu_shmem_num_banks 32
--gpgpu_shmem_limited_broadcast 0
--gpgpu_shmem_warp_parts 1
--gpgpu_coalesce_arch 86
-
-# L2 cache
--gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
+# shared memory configuration
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_smem_latency 20
+
+# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
 -gpgpu_cache:dl2_texture_only 0
 -gpgpu_dram_partition_queues 64:64:64:64
 -gpgpu_perf_sim_memcpy 1
@@ -121,13 +170,15 @@
 -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
 -gpgpu_inst_fetch_throughput 4
 # 128 KB Tex
-# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod
+# Note, TEX is deprecated in Volta. It is used for legacy apps only. Use L1D cache instead with the .nc modifier or the __ldg method
 -gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
 # 64 KB Const
 -gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
 -gpgpu_perfect_inst_const_cache 1
 
 # interconnection
+#-network_mode 1 
+#-inter_config_file config_volta_islip.icnt
 # use built-in local xbar
 -network_mode 2
 -icnt_in_buffer_limit 512
@@ -137,26 +188,34 @@
 -icnt_arbiter_algo 1
 
 # memory partition latency config 
--gpgpu_l2_rop_latency 187
--dram_latency 254
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
 
-# dram sched config
+# dram model config
 -gpgpu_dram_scheduler 1
 -gpgpu_frfcfs_dram_sched_queue_size 64
 -gpgpu_dram_return_queue_size 192
 
-# dram model config
+# for HBM3, 5 stacks, 40 channels, each (64 bits) 8 bytes width
 -gpgpu_n_mem_per_ctrlr 1
--gpgpu_dram_buswidth 2
--gpgpu_dram_burst_length 16
--dram_data_command_freq_ratio 4
+-gpgpu_dram_buswidth 8
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2  # HBM is DDR
 -gpgpu_mem_address_mask 1
--gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHZ
+# tRRDl and tWTR are missing, need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
 
-# Mem timing 
--gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4
--dram_dual_bus_interface 0
+# Timing for 850 MHZ, V100 HBM runs at 850 MHZ
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
 
+# HBM has dual bus interface, in which it can issue two col and row commands at a time
+-dram_dual_bus_interface 1
 # select lower bits for bnkgrp to increase bnkgrp parallelism
 -dram_bnk_indexing_policy 0
 -dram_bnkgrp_indexing_policy 1
@@ -170,11 +229,7 @@
 -enable_ptx_file_line_stats 1
 -visualizer_enabled 0
 
-# power model configs, disable it untill we create a real energy model
--power_simulation_enabled 0
-
 # tracing functionality
 #-trace_enabled 1
 #-trace_components WARP_SCHEDULER,SCOREBOARD
 #-trace_sampling_core 0
-

From d075d471cdcc5407151c6dc982ccd71ea3fcd640 Mon Sep 17 00:00:00 2001
From: christindbose <chris241@purdue.edu>
Date: Thu, 24 Oct 2024 16:05:43 -0400
Subject: [PATCH 3/5] L1D and L2 sizes updated

---
 configs/tested-cfgs/SM90_H100/gpgpusim.config | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config
index 02a099a11..947255b20 100644
--- a/configs/tested-cfgs/SM90_H100/gpgpusim.config
+++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config
@@ -61,7 +61,7 @@
 # high level architecture configuration
 -gpgpu_n_clusters 132	 
 -gpgpu_n_cores_per_cluster 1
--gpgpu_n_mem 40
+-gpgpu_n_mem 80
 -gpgpu_n_sub_partition_per_mchannel 2 
 -gpgpu_clock_gated_lanes 1
 
@@ -137,16 +137,16 @@
 ## L1/shared memory configuration
 # <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
 # ** Optional parameter - Required when mshr_type==Texture Fifo
-# Defualt config is 32KB DL1 and 96KB shared memory
+# Defualt config is 160KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache 
-# if the assigned shd mem = 0, then L1 cache = 128KB
+# # if the assigned shd mem = 0, then L1 cache = 256KB
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
 # disable this mode in case of multi kernels/apps execution
 -gpgpu_adaptive_cache_config 1
 -gpgpu_shmem_option 0,8,16,32,64,96
 -gpgpu_unified_l1d_size 128
 # L1 cache configuration
--gpgpu_l1_banks 4
+-gpgpu_l1_banks 8
 -gpgpu_cache:dl1  S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
 -gpgpu_l1_cache_write_ratio 25
 -gpgpu_l1_latency 20
@@ -159,8 +159,8 @@
 -gpgpu_shmem_per_block 65536
 -gpgpu_smem_latency 20
 
-# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
--gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+# 64 sets, each 128 bytes 40-way for each memory sub partition (320 KB per memory sub partition). This gives us 50MB L2 cache
+-gpgpu_cache:dl2 S:64:128:40,L:B:m:L:P,A:192:4,32:0,32
 -gpgpu_cache:dl2_texture_only 0
 -gpgpu_dram_partition_queues 64:64:64:64
 -gpgpu_perf_sim_memcpy 1
@@ -196,7 +196,7 @@
 -gpgpu_frfcfs_dram_sched_queue_size 64
 -gpgpu_dram_return_queue_size 192
 
-# for HBM3, 5 stacks, 40 channels, each (64 bits) 8 bytes width
+# for HBM3, 5 stacks, 80 channels, each (64 bits) 8 bytes width
 -gpgpu_n_mem_per_ctrlr 1
 -gpgpu_dram_buswidth 8
 -gpgpu_dram_burst_length 2

From fa235ebcf932f82391bf8ccc9a0f69295fab6825 Mon Sep 17 00:00:00 2001
From: christindbose <chris241@purdue.edu>
Date: Thu, 24 Oct 2024 16:22:11 -0400
Subject: [PATCH 4/5] small update to documentation for l1d

---
 configs/tested-cfgs/SM90_H100/gpgpusim.config | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config
index 947255b20..1961449e8 100644
--- a/configs/tested-cfgs/SM90_H100/gpgpusim.config
+++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config
@@ -35,7 +35,8 @@
 # base config were taken from v100 and modified
 # Items modified
 # Num SMs, mem channels (HBM3 datawidth per channel)
-#
+# L1D, L2 size
+# 
 # functional simulator specification
 -gpgpu_ptx_instruction_classification 0
 -gpgpu_ptx_sim_mode 0
@@ -138,7 +139,7 @@
 # <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
 # ** Optional parameter - Required when mshr_type==Texture Fifo
 # Defualt config is 160KB DL1 and 96KB shared memory
-# In Volta, we assign the remaining shared memory to L1 cache 
+# In Hopper, we assign the remaining shared memory to L1 cache 
 # # if the assigned shd mem = 0, then L1 cache = 256KB
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
 # disable this mode in case of multi kernels/apps execution

From 4fd1ae00c6752f384300c6fad5baa2b4c5937462 Mon Sep 17 00:00:00 2001
From: christindbose <chris241@purdue.edu>
Date: Fri, 3 Jan 2025 03:34:00 -0500
Subject: [PATCH 5/5] updated cache configs

---
 configs/tested-cfgs/SM90_H100/gpgpusim.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/tested-cfgs/SM90_H100/gpgpusim.config b/configs/tested-cfgs/SM90_H100/gpgpusim.config
index 1961449e8..c738a56b5 100644
--- a/configs/tested-cfgs/SM90_H100/gpgpusim.config
+++ b/configs/tested-cfgs/SM90_H100/gpgpusim.config
@@ -165,7 +165,7 @@
 -gpgpu_cache:dl2_texture_only 0
 -gpgpu_dram_partition_queues 64:64:64:64
 -gpgpu_perf_sim_memcpy 1
--gpgpu_memory_partition_indexing 2
+-gpgpu_memory_partition_indexing 0
 
 # 128 KB Inst.
 -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4