Hopper config #80

Open
wants to merge 5 commits into dev
74 changes: 74 additions & 0 deletions configs/tested-cfgs/SM90_H100/config_hopper_islip.icnt
@@ -0,0 +1,74 @@
// 144*1 fly with 32 flits per packet under gpgpusim injection mode
use_map = 0;
flit_size = 40;

// currently we do not use this, see subnets below
network_count = 2;

// Topology
topology = fly;
k = 144;
n = 1;
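
// Consistency sketch (not enforced by this file): when used via -network_mode 1,
// BookSim expects one node per SM cluster plus one per memory sub-partition.
// k = 144 matches the V100 baseline (80 clusters + 32*2 sub-partitions), while the
// accompanying gpgpusim.config describes 132 + 80*2 = 292 nodes.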

// Routing

routing_function = dest_tag;


// Flow control

num_vcs = 1;
vc_buf_size = 256;
input_buffer_size = 256;
ejection_buffer_size = 256;
boundary_buffer_size = 256;

wait_for_tail_credit = 0;

// Router architecture

vc_allocator = islip; //separable_input_first;
sw_allocator = islip; //separable_input_first;
alloc_iters = 1;

credit_delay = 0;
routing_delay = 0;
vc_alloc_delay = 1;
sw_alloc_delay = 1;

input_speedup = 1;
output_speedup = 1;
internal_speedup = 2.0;

// Traffic, GPGPU-Sim does not use this

traffic = uniform;
packet_size ={{1,2,3,4},{10,20}};
packet_size_rate={{1,1,1,1},{2,1}};

// Simulation - Don't change

sim_type = gpgpusim;
//sim_type = latency;
injection_rate = 0.1;

subnets = 2;

// Reads and writes are always used regardless of the following line
//use_read_write = 1;


read_request_subnet = 0;
read_reply_subnet = 1;
write_request_subnet = 0;
write_reply_subnet = 1;

read_request_begin_vc = 0;
read_request_end_vc = 0;
write_request_begin_vc = 0;
write_request_end_vc = 0;
read_reply_begin_vc = 0;
read_reply_end_vc = 0;
write_reply_begin_vc = 0;
write_reply_end_vc = 0;
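
// The mapping above places requests on subnet 0 and replies on subnet 1;
// separate request/reply subnetworks are the usual way to avoid protocol
// deadlock (a rationale sketch, not something this file states explicitly).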

236 changes: 236 additions & 0 deletions configs/tested-cfgs/SM90_H100/gpgpusim.config
@@ -0,0 +1,236 @@
# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas
# Northwestern University, Purdue University, The University of British Columbia
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer;
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution;
# 3. Neither the names of Northwestern University, Purdue University,
# The University of British Columbia nor the names of their contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


# This config models the H100 SXM5
# For more info about the Hopper architecture:
# https://www.advancedclustering.com/wp-content/uploads/2022/03/gtc22-whitepaper-hopper.pdf

# The base config was taken from the V100 config and modified
# Items modified:
# Num SMs, mem channels (HBM3 data width per channel)
# L1D, L2 size
#
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 70

# Device Limits
-gpgpu_stack_size_limit 1024
-gpgpu_heap_size_limit 8388608
-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 5000
-gpgpu_TB_launch_latency 0
-gpgpu_max_concurrent_kernel 128

# Compute Capability
-gpgpu_compute_capability_major 7
-gpgpu_compute_capability_minor 0

# PTX execution-driven
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 132
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 80
-gpgpu_n_sub_partition_per_mchannel 2
-gpgpu_clock_gated_lanes 1

# volta clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0
# boost mode
# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0

# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 70

# This implies a maximum of 64 warps/SM
-gpgpu_shader_core_pipeline 2048:32
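# Sanity check: 2048 threads per SM / 32 threads per warp = 64 warps per SM, matching the comment above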
-gpgpu_shader_cta 32
-gpgpu_simd_model 1

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units
## we need to scale the number of pipeline registers to be equal to the number of SP units
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4
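# Width-mapping check (sketch): the 13 values in gpgpu_pipeline_widths correspond, in order,
# to the 13 stages listed above; the ID_OC/OC_EX widths of 4 match the 4 units per execution type,
# and EX_WB is 8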

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on SFU unit
-ptx_opcode_latency_int 4,13,4,5,145,21
-ptx_opcode_initiation_int 2,2,2,2,8,4
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 4,4,4,4,130
-ptx_opcode_latency_sfu 100
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 64
-ptx_opcode_initiation_tensor 64

# Volta has the sub-core model, in which each scheduler has its own register file and EUs,
# i.e. the schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# volta has 8 banks, 4 schedulers, two banks per scheduler
# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
-gpgpu_num_reg_banks 16
-gpgpu_reg_file_port_throughput 2

# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 70

# Volta has four schedulers per core
-gpgpu_num_sched_per_core 4
# Loose round-robin (lrr) scheduler
-gpgpu_scheduler lrr
## In Volta, a warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Default config is 160KB DL1 and 96KB shared memory
# In Hopper, the remaining shared memory capacity is assigned to the L1 cache
# (if the assigned shared memory = 0, then L1 cache = 256KB)
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
# Disable this mode when running multiple kernels/apps concurrently
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 0,8,16,32,64,96
-gpgpu_unified_l1d_size 128
# L1 cache configuration
-gpgpu_l1_banks 8
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
-gpgpu_l1_cache_write_ratio 25
-gpgpu_l1_latency 20
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
# shared memory configuration
-gpgpu_shmem_size 98304
-gpgpu_shmem_sizeDefault 98304
-gpgpu_shmem_per_block 65536
-gpgpu_smem_latency 20

# 64 sets, each 128 bytes, 40-way, for each memory sub-partition (320 KB per memory sub-partition). This gives us 50MB of L2 cache
-gpgpu_cache:dl2 S:64:128:40,L:B:m:L:P,A:192:4,32:0,32
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64
-gpgpu_perf_sim_memcpy 1
-gpgpu_memory_partition_indexing 0
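# Sizing check: 64 sets x 40 ways x 128 B = 320 KB per sub-partition;
# 80 channels x 2 sub-partitions x 320 KB = 50 MB of L2 in total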

# 128 KB Inst.
-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note: TEX is deprecated since Volta; it is used for legacy apps only. Use the L1D cache instead, with the .nc modifier or the __ldg method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1

# interconnection
#-network_mode 1
#-inter_config_file config_hopper_islip.icnt
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1
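# Note (sketch): icnt_flit_size here matches flit_size = 40 in config_hopper_islip.icnt,
# which keeps the built-in xbar and BookSim (network_mode 1) settings comparable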

# memory partition latency config
-gpgpu_l2_rop_latency 160
-dram_latency 100

# dram model config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192

# for HBM3: 5 stacks, 80 channels, each 64 bits (8 bytes) wide
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 8
-gpgpu_dram_burst_length 2
-dram_data_command_freq_ratio 2 # HBM is DDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
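
# Channel-count check (sketch): HBM3 exposes 16 64-bit channels per stack, so
# 5 stacks x 16 = 80 channels at 8 bytes each, matching gpgpu_n_mem and
# gpgpu_dram_buswidth above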

# HBM timings are adopted from the Hynix HBM (JESD235) specification and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
# Timings for 1 GHz
# tRRDl and tWTR are missing and need to be added
#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"

# Timings for 850 MHz; V100 HBM runs at 850 MHz and the same DRAM clock is used here
-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
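
# The 850 MHz numbers above are, to a good approximation, the commented 1 GHz
# numbers scaled by 0.85 and rounded (e.g. RCD 14 -> 12, RAS 33 -> 28, RC 47 -> 40)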

# HBM has a dual-bus interface that can issue a column and a row command at the same time
-dram_dual_bus_interface 1
# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0
-dram_bnkgrp_indexing_policy 1

#-dram_seperate_write_queue_enable 1
#-dram_write_queue_size 64:56:32

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0