diff --git a/.github/workflows/supported_devices_platforms_md.yml b/.github/workflows/supported_devices_platforms_md.yml new file mode 100644 index 0000000000..79d333aa17 --- /dev/null +++ b/.github/workflows/supported_devices_platforms_md.yml @@ -0,0 +1,20 @@ +# This is a basic workflow to help you get started with GitHub Actions + +name: Run the shell script to generate sonic image links in supported_devices_platforms_md.sh in a dedicated branch + +# Run the script once in a minute. Github may take 15 minutes to run this even though we request once in a minute, which is OK. +on: + schedule: + - cron: '5 * * * *' + +jobs: + build: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + - uses: actions/checkout@v2 + + - name: Run the script to build a json file with details about various builds for various platforms + run: sh ./supported_devices_platforms_md.sh diff --git a/MoM.html b/MoM.html index c7f98aee4f..449922cdab 100755 --- a/MoM.html +++ b/MoM.html @@ -104,6 +104,131 @@

SONiC community meeting minutes

Links To Meeting Agenda Links To Minutes Of The meeting + +   Feb 22 2022    + auto-techsupport + MoM + + +   Feb 15 2022    + No Meeting + MoM + + +   Feb 08 2022    + Auto Ng & Link training + MoM + + +   Feb 01 2022    + Counter delay via config_db + MoM + + +   Jan 25 2021    + Add VLAN Stacking + MoM + + +   Jan 18 2021    + Deterministic approach in SONiC for Interface Link bring-up sequence + MoM + + +   Jan 11 2021    + Password Hardening + MoM + + +   Jan 04 2021    + No Meeting + MoM + + +   Dec 28 2021    + Route Flow Counter + MoM + + +   Dec 21 2021    + Route Flow Counter + MoM + + +   Dec 14 2021    + Route Flow Counter + MoM + + +   Dec 07 2021    + 202205 Release Roadmap Discussion + MoM + + +   Nov 30 2021    + 202111 Release Tracking + MoM + + +   Nov 23 2021    + No Meeting + MoM + + +   Nov 16 2021    + Release Discussion + MoM + + +   Nov 09 2021    + Sorted Next Hop List HLD + MoM + + +   Nov 02 2021    + The SONiC-Based Framework for SAI Testing and Integration + MoM + + +   Oct 26 2021    + 202111 Release Tracking + MoM + + +   Oct 19 2021    + 202111 Release Tracking + MoM + + +   Oct 12 2021    + NVGRE & Dynamic Policy Based Hashing + MoM + + +   Oct 05 2021    + CMIS Diagnostics & System Ready Enhancements + MoM + + +   Sep 28 2021    + Host Interface counters & Guidelines for reference proprietary code + MoM + + +   Sep 21 2021    + SONiC TACACS per command authorization and accounting & MPLS EXP to TC maps + MoM + + +   Sep 14 2021    + ECMP Overlay BFD support + MoM + + +   Sep 07 2021    + Handle port config change on fly in xcvrd & Reclaim reserved buffer for unused ports + MoM +   Aug 31 2021    Show Running Command Enhancement diff --git a/Supported-Devices-and-Platforms.html b/Supported-Devices-and-Platforms.html index 442917cacf..53d8a72eef 100644 --- a/Supported-Devices-and-Platforms.html +++ b/Supported-Devices-and-Platforms.html @@ -1,4 +1,8 @@ + + + + @@ -40,314 +44,495 @@ - Page-4 - + Page-5 + Actor lifeline.1378 ConfigDB - 
- Sheet.1175 - + + Sheet.1001 + - - Sheet.1176 - + + Sheet.1002 + - - Sheet.1177 + + Sheet.1003 - - Sheet.1178 - + + Sheet.1004 + - - - ConfigDB + + + ConfigDB - + Object lifeline.1383 PbhOrch - - Sheet.1180 - + + Sheet.1006 + - - Sheet.1181 - + + Sheet.1007 + - - Sheet.1182 + + Sheet.1008 - - Sheet.1183 - + + Sheet.1009 + - - - PbhOrch + + + PbhOrch - + Object lifeline.1388 AclOrch - - Sheet.1185 - + + Sheet.1011 + - - Sheet.1186 - + + Sheet.1012 + - - Sheet.1187 + + Sheet.1013 - - Sheet.1188 - + + Sheet.1014 + - - - AclOrch + + + AclOrch - + Object lifeline.1393 SAI - - Sheet.1190 - + + Sheet.1016 + - - Sheet.1191 - + + Sheet.1017 + - - Sheet.1192 + + Sheet.1018 - - Sheet.1193 - + + Sheet.1019 + - - - SAI + + + SAI - + Activation.1398 - + - + Activation.1399 - + - + Asynchronous Message.1400 PBH_HASH_FIELD|name - - - PBH_HASH_FIELD|name - + + + PBH_HASH_FIELD|name + Self Message.1401 process data - - - process data - + + + process data + Asynchronous Message.1402 PBH_TABLE|name - - - PBH_TABLE|name - + + + PBH_TABLE|name + Message.1412 updateAclTable - - - updateAclTable - + + + updateAclTable + Activation.1413 - + - + Return Message.1414 return <result> - - - return <result> - + + + return <result> + Message.1418 set_acl_table_attribute - - - set_acl_table_attribute - + + + set_acl_table_attribute + Return Message.1419 return <status> - - - return <status> - + + + return <status> + Activation.1430 - + - + Self Message.1431 process data - - - process data - + + + process data + Activation.1452 - + - + Self Message.1453 updatePbhHashField - - - updatePbhHashField - + + + updatePbhHashField + Activation.1451 - + - + Message.1455 set_fine_grained_hash_field_attribute - - - set_fine_grained_hash_field_attribute - + + + set_fine_grained_hash_field_attribute + Activation.1456 - + - + Return Message.1457 return <status> - - - return <status> - + + + return <status> + Asynchronous Message.1458 PBH_HASH|name - - - PBH_HASH|name - + + + PBH_HASH|name + Activation.1459 - + 
- + Self Message.1460 updatePbhHash - - - updatePbhHash - + + + updatePbhHash + Activation.1461 - + - + Message.1462 set_hash_attribute - - - set_hash_attribute - + + + set_hash_attribute + Activation.1463 - + - + Return Message.1464 return <status> - - - return <status> - + + + return <status> + Asynchronous Message.1466 PBH_RULE|name - - - PBH_RULE|name - + + + PBH_RULE|name + Message.1469 updateAclRule - - - updateAclRule - + + + updateAclRule + Activation.1470 - + - + Return Message.1471 return <result> - - - return <result> - + + + return <result> + Message.1472 set_acl_entry_attribute - - - set_acl_entry_attribute - + + + set_acl_entry_attribute + Return Message.1473 return <status> - - - return <status> - + + + return <status> + Activation.1565 - + - + Self Message.1566 process data - - - process data - + + + process data + Activation.1567 - + - + Self Message.1568 updatePbhTable - - - updatePbhTable - + + + updatePbhTable + Activation.1569 - + - + Activation.1673 - + - + Activation.1573 - + - + Self Message.1675 process data - - - process data - + + + process data + Activation.1575 - + - + Self Message.1576 updatePbhRule - - - updatePbhRule - + + + updatePbhRule + Activation.1577 - + - + Activation.1579 - + + + Loop fragment.877 + + + + + Sheet.1064 + transaction + + transaction + + Sheet.1065 + HSET/HDEL + HSET/HDEL + + + Actor lifeline.1087 + CLI + + Sheet.1067 + + + + Sheet.1068 + + + + Sheet.1069 + + + Sheet.1070 + + + + + CLI + + + Asynchronous Message.1092 + PBH_HASH_FIELD|key|field + + + PBH_HASH_FIELD|key|field + + Activation.1093 + + + + Asynchronous Message.1073 + PBH_HASH_FIELD|key|field + + + PBH_HASH_FIELD|key|field + + Asynchronous Message.1076 + PBH_HASH_FIELD|key|field + + + PBH_HASH_FIELD|key|field + + Sheet.1075 + . . . + + . . . 
+ + Loop fragment.1097 + + + + + Sheet.1077 + transaction + + transaction + + Sheet.1078 + HSET/HDEL + HSET/HDEL + + + Asynchronous Message.1100 + PBH_HASH|key|field + + + PBH_HASH|key|field + + Asynchronous Message.1101 + PBH_HASH|key|field + + + PBH_HASH|key|field + + Asynchronous Message.1102 + PBH_HASH|key|field + + + PBH_HASH|key|field + + Sheet.1082 + . . . + + . . . + + Loop fragment.1104 + + + + + Sheet.1084 + transaction + + transaction + + Sheet.1085 + HSET/HDEL + HSET/HDEL + + + Asynchronous Message.1107 + PBH_TABLE|key|field + + + PBH_TABLE|key|field + + Asynchronous Message.1108 + PBH_TABLE|key|field + + + PBH_TABLE|key|field + + Asynchronous Message.1109 + PBH_TABLE|key|field + + + PBH_TABLE|key|field + + Sheet.1089 + . . . + + . . . + + Loop fragment.1115 + + + + + Sheet.1091 + transaction + + transaction + + Sheet.1092 + HSET/HDEL + HSET/HDEL + + + Asynchronous Message.1118 + PBH_RULE|key|field + + + PBH_RULE|key|field + + Asynchronous Message.1119 + PBH_RULE|key|field + + + PBH_RULE|key|field + + Asynchronous Message.1120 + PBH_RULE|key|field + + + PBH_RULE|key|field + + Sheet.1096 + . . . + + . . . 
diff --git a/doc/pbh/pbh-design.md b/doc/pbh/pbh-design.md index 1ffb411f8d..548de5c744 100644 --- a/doc/pbh/pbh-design.md +++ b/doc/pbh/pbh-design.md @@ -28,11 +28,19 @@ - [2.4.1.2 PBH rule](#2412-pbh-rule) - [2.4.1.3 PBH hash](#2413-pbh-hash) - [2.4.1.4 PBH hash field](#2414-pbh-hash-field) - - [2.4.2 Configuration sample](#242-configuration-sample) + - [2.4.2 State DB](#242-state-db) + - [2.4.2.1 PBH table](#2421-pbh-table) + - [2.4.2.2 PBH rule](#2422-pbh-rule) + - [2.4.2.3 PBH hash](#2423-pbh-hash) + - [2.4.2.4 PBH hash field](#2424-pbh-hash-field) + - [2.4.3 Data sample](#243-data-sample) + - [2.4.4 Configuration sample](#244-configuration-sample) - [2.5 Flows](#25-flows) - - [2.5.1 PBH add](#251-pbh-add) - - [2.5.2 PBH update](#252-pbh-update) - - [2.5.3 PBH remove](#253-pbh-remove) + - [2.5.1 Key modification](#251-key-modification) + - [2.5.1.1 PBH add](#2511-pbh-add) + - [2.5.1.2 PBH remove](#2512-pbh-remove) + - [2.5.2 Field modification](#252-field-modification) + - [2.5.2.1 PBH update](#2521-pbh-update) - [2.6 CLI](#26-cli) - [2.6.1 Command structure](#261-command-structure) - [2.6.2 Usage examples](#262-usage-examples) @@ -46,10 +54,11 @@ ## Revision -| Rev | Date | Author | Description | -|:---:|:----------:|:--------------:|:-------------------------------------------| -| 0.1 | 15/03/2021 | Nazarii Hnydyn | Initial version | -| 0.2 | 07/06/2021 | Nazarii Hnydyn | Update DB schema: introduce PBH hash field | +| Rev | Date | Author | Description | +|:---:|:----------:|:--------------:|:------------------------------------------------| +| 0.1 | 15/03/2021 | Nazarii Hnydyn | Initial version | +| 0.2 | 07/06/2021 | Nazarii Hnydyn | Update DB schema: introduce PBH hash field | +| 0.3 | 15/11/2021 | Nazarii Hnydyn | PBH modification flows: introduce field set/del | ## About this manual @@ -96,8 +105,8 @@ This document describes the high level design of PBH feature in SONiC [Figure 1: PBH design](#figure-1-pbh-design) [Figure 2: PBH OA 
design](#figure-2-pbh-oa-design) [Figure 3: PBH add flow](#figure-3-pbh-add-flow) -[Figure 4: PBH update flow](#figure-4-pbh-update-flow) -[Figure 5: PBH remove flow](#figure-5-pbh-remove-flow) +[Figure 4: PBH remove flow](#figure-4-pbh-remove-flow) +[Figure 5: PBH update flow](#figure-5-pbh-update-flow) ## List of tables @@ -203,8 +212,10 @@ A custom hashing can be configured for Regular/FG ECMP and LAG. ###### Figure 2: PBH OA design A `PbhOrch` class with a set of data structures will be implemented to handle PBH feature. -OA will be extended with a new PBH Config DB schema and SAI FG Hash API support. +OA will be extended with a new PBH Config DB/State DB schema and SAI FG Hash API support. PBH table/rule/hash/hash-field updates will be processed by OA based on Config DB changes. +Each update operation will be verified against generic/vendor specific capabilities. +Generic/Vendor specific capabilities by default will be stored in State DB by OA. Some object updates will be handled and some will be considered as invalid. ### 2.3.2 PBH orch @@ -244,6 +255,19 @@ On hash field create, `PbhOrch` will verify if the hash field already exists. Cr exists will be treated as an update. Regular hash field add/remove will update the internal class structures and appropriate SAI objects will be created or deleted. +PBH object modification concept allows to do a fine-grained field/value tuple management. +For that purpose a PBH capabilities table will be introduced. Each PBH key will have it's own set of +field capabilities defined in a State DB. + +PBH capabilities: +1. ADD - field can be set to the redis hash in case it does not exist yet +2. UPDATE - field can be set to the redis hash in case it already exists +3. REMOVE - field can be deleted from the redis hash in case it does exist + +In general, PBH capabilities represent a mix of SAI interface/vendor restrictions. +When special policy is not required, a generic SAI-based implementation will be used by OA. 
+Platform/Vendor identification will be done via `platform` environment variable. + **Skeleton code:** ```cpp class PbhOrch : public Orch @@ -285,7 +309,7 @@ class AclOrch : public Orch, public Observer ... bool updateAclTable(string table_id, AclTable &table); - bool updateAclRule(string table_id, string rule_id, bool enableCounter); + bool updateAclRule(shared_ptr updatedAclRule); ... }; @@ -358,7 +382,7 @@ public: bool validateAddMatch(const sai_attribute_t &attr); bool validateAddAction(const sai_attribute_t &attr); bool validate() override; - void update(SubjectType, void *) override; + void onUpdate(SubjectType, void *) override; }; ``` @@ -472,7 +496,245 @@ ip-mask = ipv4-addr / ipv6-addr **Note:** field _ip_mask_ is only valid when _hash_field_ equals _INNER_DST/SRC_IPV4_ or _INNER_DST/SRC_IPV6_ -### 2.4.2 Configuration sample +### 2.4.2 State DB + +#### 2.4.2.1 PBH table +```abnf +; defines schema for PBH table capabilities +key = PBH_CAPABILITIES|table ; must be unique + +; field = value +interface_list = capabilities +description = capabilities + +; value annotations +capabilities = "" \ "ADD" \ "UPDATE" \ "REMOVE" \ + "ADD" "," "UPDATE" \ + "ADD" "," "REMOVE" \ + "UPDATE" "," "ADD" \ + "UPDATE" "," "REMOVE" \ + "REMOVE" "," "ADD" \ + "REMOVE" "," "UPDATE" \ + "ADD" "," "UPDATE" "," "REMOVE" +``` + +#### 2.4.2.2 PBH rule +```abnf +; defines schema for PBH rule capabilities +key = PBH_CAPABILITIES|rule ; must be unique + +; field = value +priority = capabilities +gre_key = capabilities +ether_type = capabilities +ip_protocol = capabilities +ipv6_next_header = capabilities +l4_dst_port = capabilities +inner_ether_type = capabilities +hash = capabilities +packet_action = capabilities +flow_counter = capabilities + +; value annotations +capabilities = "" \ "ADD" \ "UPDATE" \ "REMOVE" \ + "ADD" "," "UPDATE" \ + "ADD" "," "REMOVE" \ + "UPDATE" "," "ADD" \ + "UPDATE" "," "REMOVE" \ + "REMOVE" "," "ADD" \ + "REMOVE" "," "UPDATE" \ + "ADD" "," "UPDATE" "," 
"REMOVE" +``` + +#### 2.4.2.3 PBH hash +```abnf +; defines schema for PBH hash capabilities +key = PBH_CAPABILITIES|hash ; must be unique + +; field = value +hash_field_list = capabilities + +; value annotations +capabilities = "" \ "ADD" \ "UPDATE" \ "REMOVE" \ + "ADD" "," "UPDATE" \ + "ADD" "," "REMOVE" \ + "UPDATE" "," "ADD" \ + "UPDATE" "," "REMOVE" \ + "REMOVE" "," "ADD" \ + "REMOVE" "," "UPDATE" \ + "ADD" "," "UPDATE" "," "REMOVE" +``` + +#### 2.4.2.4 PBH hash field +```abnf +; defines schema for PBH hash field capabilities +key = PBH_CAPABILITIES|hash-field ; must be unique + +; field = value +hash_field = capabilities +ip_mask = capabilities +sequence_id = capabilities + +; value annotations +capabilities = "" \ "ADD" \ "UPDATE" \ "REMOVE" \ + "ADD" "," "UPDATE" \ + "ADD" "," "REMOVE" \ + "UPDATE" "," "ADD" \ + "UPDATE" "," "REMOVE" \ + "REMOVE" "," "ADD" \ + "REMOVE" "," "UPDATE" \ + "ADD" "," "UPDATE" "," "REMOVE" +``` + +### 2.4.3 Data sample + +**Config DB:** +```bash +redis-cli -n 4 HGETALL 'PBH_HASH_FIELD|inner_ip_proto' +1) "hash_field" +2) "INNER_IP_PROTOCOL" +3) "sequence_id" +4) "1" + +redis-cli -n 4 HGETALL 'PBH_HASH_FIELD|inner_l4_dst_port' +1) "hash_field" +2) "INNER_L4_DST_PORT" +3) "sequence_id" +4) "2" + +redis-cli -n 4 HGETALL 'PBH_HASH_FIELD|inner_l4_src_port' +1) "hash_field" +2) "INNER_L4_SRC_PORT" +3) "sequence_id" +4) "2" + +redis-cli -n 4 HGETALL 'PBH_HASH_FIELD|inner_dst_ipv4' +1) "hash_field" +2) "INNER_DST_IPV4" +3) "ip_mask" +4) "255.0.0.0" +5) "sequence_id" +6) "3" + +redis-cli -n 4 HGETALL 'PBH_HASH_FIELD|inner_src_ipv4' +1) "hash_field" +2) "INNER_SRC_IPV4" +3) "ip_mask" +4) "0.0.0.255" +5) "sequence_id" +6) "3" + +redis-cli -n 4 HGETALL 'PBH_HASH_FIELD|inner_dst_ipv6' +1) "hash_field" +2) "INNER_DST_IPV6" +3) "ip_mask" +4) "ffff::" +5) "sequence_id" +6) "4" + +redis-cli -n 4 HGETALL 'PBH_HASH_FIELD|inner_src_ipv6' +1) "hash_field" +2) "INNER_SRC_IPV6" +3) "ip_mask" +4) "::ffff" +5) "sequence_id" +6) "4" + +redis-cli -n 4 
HGETALL 'PBH_HASH|inner_v4_hash' +1) "hash_field_list@" +2) "inner_ip_proto,inner_l4_dst_port,inner_l4_src_port,inner_dst_ipv4,inner_src_ipv4" + +redis-cli -n 4 HGETALL 'PBH_HASH|inner_v6_hash' +1) "hash_field_list@" +2) "inner_ip_proto,inner_l4_dst_port,inner_l4_src_port,inner_dst_ipv6,inner_src_ipv6" + +redis-cli -n 4 HGETALL 'PBH_RULE|pbh_table|nvgre' + 1) "ether_type" + 2) "0x0800" + 3) "flow_counter" + 4) "DISABLED" + 5) "gre_key" + 6) "0x2500/0xffffff00" + 7) "hash" + 8) "inner_v6_hash" + 9) "inner_ether_type" +10) "0x86dd" +11) "ip_protocol" +12) "0x2f" +13) "packet_action" +14) "SET_ECMP_HASH" +15) "priority" +16) "2" + +redis-cli -n 4 HGETALL 'PBH_RULE|pbh_table|vxlan' + 1) "ether_type" + 2) "0x0800" + 3) "flow_counter" + 4) "ENABLED" + 5) "hash" + 6) "inner_v4_hash" + 7) "inner_ether_type" + 8) "0x0800" + 9) "ip_protocol" +10) "0x11" +11) "l4_dst_port" +12) "0x12b5" +13) "packet_action" +14) "SET_LAG_HASH" +15) "priority" +16) "1" + +redis-cli -n 4 HGETALL 'PBH_TABLE|pbh_table' +1) "description" +2) "NVGRE and VxLAN" +3) "interface_list@" +4) "Ethernet0,Ethernet4,PortChannel0001,PortChannel0002" +``` + +**State DB:** +```bash +redis-cli -n 6 HGETALL 'PBH_CAPABILITIES|table' + 1) "interface_list" + 2) "UPDATE" + 3) "description" + 4) "UPDATE" + +redis-cli -n 6 HGETALL 'PBH_CAPABILITIES|rule' + 1) "priority" + 2) "UPDATE" + 3) "ether_type" + 4) "ADD,UPDATE,REMOVE" + 5) "ip_protocol" + 6) "ADD,UPDATE,REMOVE" + 7) "ipv6_next_header" + 8) "ADD,UPDATE,REMOVE" + 9) "l4_dst_port" + 10) "ADD,UPDATE,REMOVE" + 11) "gre_key" + 12) "ADD,UPDATE,REMOVE" + 13) "inner_ether_type" + 14) "ADD,UPDATE,REMOVE" + 15) "hash" + 16) "UPDATE" + 17) "packet_action" + 18) "ADD,UPDATE,REMOVE" + 19) "flow_counter" + 20) "ADD,UPDATE,REMOVE" + +redis-cli -n 6 HGETALL 'PBH_CAPABILITIES|hash' + 1) "hash_field_list" + 2) "UPDATE" + +redis-cli -n 6 HGETALL 'PBH_CAPABILITIES|hash-field' + 1) "hash_field" + 2) "" + 3) "ip_mask" + 4) "" + 5) "sequence_id" + 6) "" +``` + +### 2.4.4 Configuration 
sample **Inner 5-tuple hashing:** ```json @@ -569,23 +831,27 @@ ip-mask = ipv4-addr / ipv6-addr ## 2.5 Flows -### 2.5.1 PBH add +### 2.5.1 Key modification + +#### 2.5.1.1 PBH add ![PBH add flow](images/pbh_add_flow.svg "Figure 3: PBH add flow") ###### Figure 3: PBH add flow -### 2.5.2 PBH update +#### 2.5.1.2 PBH remove -![PBH update flow](images/pbh_update_flow.svg "Figure 4: PBH update flow") +![PBH remove flow](images/pbh_remove_flow.svg "Figure 4: PBH remove flow") -###### Figure 4: PBH update flow +###### Figure 4: PBH remove flow -### 2.5.3 PBH remove +### 2.5.2 Field modification -![PBH remove flow](images/pbh_remove_flow.svg "Figure 5: PBH remove flow") +#### 2.5.2.1 PBH update -###### Figure 5: PBH remove flow +![PBH update flow](images/pbh_update_flow.svg "Figure 5: PBH update flow") + +###### Figure 5: PBH update flow ## 2.6 CLI @@ -602,7 +868,10 @@ config | |--- rule | |--- add OPTIONS - | |--- update OPTIONS + | |--- update + | | |--- field + | | |--- set OPTIONS + | | |--- del OPTIONS | |--- delete | |--- hash @@ -675,7 +944,11 @@ config pbh rule add 'pbh_table' 'nvgre' \ --hash 'inner_v6_hash' \ --packet-action 'SET_ECMP_HASH' \ --flow-counter 'DISABLED' -config pbh rule update 'pbh_table' 'nvgre' \ +config pbh rule update field del 'pbh_table' 'nvgre' \ +--ip-protocol +config pbh rule update field set 'pbh_table' 'nvgre' \ +--ether-type '0x86dd' \ +--ipv6-next-header '0x2f' \ --flow-counter 'ENABLED' config pbh rule delete 'pbh_table' 'nvgre' ``` @@ -924,6 +1197,11 @@ PBH basic configuration test: 3. Verify ASIC DB object count after PBH hash creation/removal 4. Verify ASIC DB object count after PBH hash field creation/removal +PBH basic update test: +1. Verify ASIC DB object state after PBH table update +2. Verify ASIC DB object state after PBH rule update +3. Verify ASIC DB object state after PBH hash update + PBH extended configuration test: 1. Create inner 5-tuple PBH hash fields 2. 
Create PBH hash diff --git a/doc/pins/Packet_io.md b/doc/pins/Packet_io.md index cfd22220de..d2be71fca1 100644 --- a/doc/pins/Packet_io.md +++ b/doc/pins/Packet_io.md @@ -16,17 +16,18 @@ _Rev v0.1_ Rev | RevDate | Author(s) | Change Description ---- | ---------- | ----------- | ------------------ v0.1 | 06/23/2021 | Google, ONF | Initial Version +v0.2 | 03/31/2022 | Google, ONF | Extract Library ## Scope -This document covers the high level design aspects of Packet I/O in SONiC for P4Runtime application. +This document covers the high level design aspects of Packet I/O in SONiC for both the P4Runtime application and any subsequent application that would benefit from packet exchange using Generic Netlink Sockets. ## Overview -SONiC supports Packet I/O on Linux netdev interfaces but this does not meet some unique requirements of P4Runtime application. This document details the requirements and captures the design changes necessary to meet the new requirements. +SONiC supports Packet I/O on Linux netdev interfaces but this does not meet some unique requirements of P4Runtime application. This document details the requirements and captures the design changes necessary to meet the new requirements. This document also explains the extraction of a library from the P4Runtime application in order to make the functionality available to other applications as required. In addition to the new library this document will explain a new application which serves as a model for using the library and allows for TCPDump like sniffing capability and packet generation using Generic Netlink Sockets. **Requirements** @@ -176,5 +177,58 @@ Vendor work is needed to enable the creation of the “submit_to_ingress” port ![drawing](images/p4rt_flow.png) +## New Library Usage + +The library functionality is defined in genl-packet/receive_genetlink.h. 
First a callback function must be defined with the following signature: + +``` +using ReceiveCallbackFunction = std::function; +``` +Then the StartReceive function is called which returns a thread which calls the callback function on packet receipt.  +``` +std::thread StartReceive( + packet_metadata::ReceiveCallbackFunction callback_function, + nl_recvmsg_msg_cb_t process_callback_function); +``` +A version of process_callback_function is implemented in genl-packet/receive_genenlink.cc and will be used if process_callback_function is NULL. The purpose of process_callback_function is to extract the netlink attributes: source port, destination port and payload and pass them to the ReceiveCallbackFunction. This can be implemented by the library user if the logic in the library is insufficient. + +## Library Phases +- **Current:** + - The current kernel module, documented in PacketIO.md, remains unchanged and is supplied by the asic vendor or delegates. + - There is a single multicast group / queue that all producers and consumers can use.  All packets are sent to all consumers.  Filtering must be done post-consumption. + - Packet meta-data is fixed. + +- **Dynamic generic netlink:** + - The kernel module will have an API that allows user space applications to manage multicast groups and queues at runtime. + - The library will allow applications to specify which multicast group / queue to listen to and allow for basic filtering. + - Packet meta-data continues to be fixed. +- **Dynamic meta-data:** + - Kernel module is likely unchanged. + - Multicast group / queue functionality is likely unchanged. + - Packet meta-data can be specified by the using application.   + +## Sniffer Application: +The sniffer provides the means of a tcpdump-like tool to listen to the genetlink device. The sniffer can be used for listening to traffic, as well as recording the traffic into a file or displaying to standard out. The resulting pcapng file can then be viewed using Wireshark. 
The sender can be used to send an example packet or packets from a pcap/pcapng file through genetlink. The sender also registers a new genetlink family and group called genl_packet and packets respectively. Both sniffer and sender use the pcapplusplus library which is an actively maintained open source library. + +Both the sender and the sniffer can be compiled via bazel or sonic-buildimage. Either way, once compiled or the necessary binary installed the following commands can be used to use the two applications ([sniffer] indicates the sniffer application and [sender] indicates the sender application): +``` +- [sniffer] : launches the sniffer and records all packets into a file named out.pcapng. +- [sniffer] -a : will either append the packets to out.pcapng or to a custom filename if given. +- [sniffer] -o=- : This will print the hex representation of the received packets to standard out. +- [sniffer] -o=hello.pcapng : By providing a filename you can write the genetlink packets into a given file. In this example it will be hello.pcapng. +- [sniffer] -verbose : will print out verbose information about the packets received including metadata and packet contents. +``` + +The packet metadata carried with process_callback_function gets put into a comment in the pcapng. If the sniffer is to be run outside of P4Runtime the user might want to construct their own custom receive thread using customCallbackReceive found in the header file for the sniffer, since the carried metadata might be different. + +``` +- sudo [sender] : will send a sample packet using genetlink. +- sudo [sender] -inputfile=hello.pcapng : will read the packets from a given file and send them via genetlink. +- sudo [sender] -packet=AABBCCDD : will send the given packet in hex representation via genetlink. 
+``` + + diff --git a/doc/port-add-del-dynamically/dynamic_port_add_del_hld.md b/doc/port-add-del-dynamically/dynamic_port_add_del_hld.md new file mode 100644 index 0000000000..076df35da4 --- /dev/null +++ b/doc/port-add-del-dynamically/dynamic_port_add_del_hld.md @@ -0,0 +1,305 @@ +# Enhancements to add or del ports dynamically + + +# Table of Contents + * [Revision](#revision) + * [About This Manual](#about-this-manual) + * [Scope](#scope) + * [Initialization stage](#init-stage) + * [Post init stage](#post-init) + + + +#### Revision +| Rev | Date | Author | Change Description | +|:---:|:-------:|:------------------:|:------------------:| +| 0.1 | 2021-09 | Tomer Israel | Initial Version | + + +## Motivation +The feature is to support adding or removing ports from the system dynamically after init stage. +The system can start with all the ports on config db or only several ports from the full ports or without any ports on config db (zero ports system). +The ports will be added or removed through the port table on config db. +Before removing a port the user is responsible to remove all dependencies of this port before removing it. + + +# About this Manual +This document provides general information about ports creation or removal in SONiC. The creation of ports on the init stage and creating or removing ports after init stage. +# Scope +This document describes the high level design of orchagent and the impact of creating/removing ports dynamically on other services. The design describes the current implementaion and suggestion to changes that needs to be implemented in order to fully support the dynamic create/remove of ports. + +## Relevant PRs +[PR #7999 Allow cfggen to work on system without ports](https://github.com/Azure/sonic-buildimage/pull/7999)
+[PR #1860 Remove buffer drop counter when port is deleted](https://github.com/Azure/sonic-swss/pull/1860)
+[PR #1808 [swss]: Allow portsyncd to run on system without ports](https://github.com/Azure/sonic-swss/pull/1808)
+[PR #2019 [orchagent] add & remove port counters dynamically each time port was added or removed](https://github.com/Azure/sonic-swss/pull/2019)
+[PR #2022 Dynamic port configuration - add port buffer cfg to the port ref counter](https://github.com/Azure/sonic-swss/pull/2022)
+ +## Design + + + + +# Initialization stage + + + ![Init stage](images/init_stage_diagram.png) + +- **Portsyncd** read port config db info and push it to App db and will set PortConfigDone on App db when finished. +- **Portsorch** (orchagent) for every port added to the APP DB … will create port through SAI call and create also host interface for each time port is added to port APP table. +- **Portsyncd** will receive netlink notification for each host interface that was created, and update an entry on state db +- When all host interfaces are created **Portsyncd** is setting PortInitDone. + + +### App DB flags: +PortConfigDone – finished to configure ports on init +PortInitDone – all host interfaces were created + +Some services are waiting for these flags before they continue to run: +Orchagent is waiting for PortConfigDone before continuing to create the ports on SAI. +Xcvrd, buffermgrd, natmgr, natsync – waiting for PortInitDone + +## Init types: +The Dynamic port add/remove configuration will be supported for all types of init types:
+• Start the system with full ports on config db
+• Start the system without some of the ports on config db
+• Start the system with zero ports on config db
+ +**Note:** This is a new type of init that was never tested and will be supported.
+The zero-port system is a special case of this feature.
+Few PRs were already added in order to support zero ports init:
+[PR #7999 Allow cfggen to work on system without ports](https://github.com/Azure/sonic-buildimage/pull/7999)
+[PR #1860 Remove buffer drop counter when port is deleted](https://github.com/Azure/sonic-swss/pull/1860)
+[PR #1808 [swss]: Allow portsyncd to run on system without ports](https://github.com/Azure/sonic-swss/pull/1808)
+ +after init stage we can add/remove ports dynamically through redis call to add/remove entry to/from port table on config db ("PORT") + +## Init with zero ports: +Starting with zero ports requires new SKU for zero ports with these changes:
+**Hwsku.json** – without interfaces
+**Platform.json** – without interfaces
+**Sai profile** needs to be without port entries.
+ + +On this zero ports SKU the sonic-cfggen will generate config_db.json file without any ports.
+  + + +# Post init stage - dynamically + +#### Add port: + + ![Add port](images/add_port_diagram.png) + +1. A process or a user can add port entry to the port table on Config DB. For example, the line card manager will add port entry to the port table. +2. On portsyncd - Port set event is received from Config DB. +3. Portsyncd is adding the new port info to App DB +4. On portsorch (orchagent) - Port set event is received from App DB. +5. Portsorch is creating the port on SAI. +6. SDK is creating the port and the host interfaces. +7. Host interface is created and Netlink event received on portsyncd. +8. Portsyncd is adding a new port entry on state db. +9. Events from ASIC DB received on portsorch when operstate are changing (up or down). +10. Portsorch are updating the operstate on App DB + + + +#### Del port: + +Del Port – Remove port element from config DB +Note: before removing a port, the port needs to be without any dependencies (ACL, VLAN, LAG, buffer pg). +For example: we need to remove the buffer pg that configured to a port and then remove the port. + + ![Remove port](images/remove_port_diagram.png) + +1. Before we remove a port, we need to remove all dependencies of these ports (vlan, acl, buffer…) +2. A process or a user can remove port entry from the port table on Config DB. For example, line card manager will remove port entry. +3. On portmgrd we receive delete event from Config DB. +4. Portmgrd will remove this entry on App DB. +5. Portsorch will receive remove entry event from the App DB. +6. Portsorch will delete the port and the host interface on SAI. +7. SAI will remove this port on SDK +8. Host interface will be removed and netlink event will be received on portsyncd. +9. 
Portsyncd will remove the port entry from state db + + +## Modules that “listen” to changes on config port table, App port table and State port table + +#### SWSS - Portsyncd: +• ADD PORT - Receive new port from port config table, add the port info to APP DB (update speed, interface_type, autoneg, adv_speeds, adv_interface_types).
+when host interface was created, add this port entry to state db
+• DEL PORT – portmgrd is removing this entry from app db.
+when host interface was removed remove this port entry from state db + + +#### SWSS - Portsorch: + +• ADD PORT - Receive new port from port APP table -> create port on SAI -> create host interface -> add Flex counters
+Receive notification from ASIC DB when oper_state is changing, update the port oper_state on APP db. +• DEL PORT - Receive del port from port APP table -> remove flex counters -> del port on SAI -> del host interface
+ +Currently the orchagent is adding/removing these flex counters: +- PORT_BUFFER_DROP_STAT_FLEX_COUNTER_GROUP
+- PORT_STAT_COUNTER_FLEX_COUNTER_GROUP
+
+
+
+Changes need to be added:
+We need to add more port counters that will be added/removed dynamically whenever a port is created or removed:
+- Queue port counters (queue & queue watermark counters)
+- PG counters
+- Debug counters: port ingress drops (DEBUG_COUNTER config table)
+- Debug counters: port egress drops
+- Pfc watchdog counters
+
+In the current implementation these counters were created for all ports only after init stage is done.
+
+
+** Counters PR: **
+[https://github.com/Azure/sonic-swss/pull/2019](https://github.com/Azure/sonic-swss/pull/2019) + + + +#### PortMgrd: +- ADD Port: Set (admin_status, mtu, learn_mode, tpid) from config db to App db
+- Del port: Receive del port operation from port config table, remove this port from APP DB.
+ +**No need to change the code** + +#### Sflowmgr: +Add port: Event from config db - Update the speed to sflow internal db.
+Del port: Delete event from config db - remove the speed from sflow internal db.
+ +**No need to change the code** + +#### Teammgrd: +Listen to events from config db: +set event -> add the port to lag (check before if entry exist on state db - host interface exist)
+del event -> remove port from lag
+ +Listen to events from state db: +set event -> add the port to lag (the below content is taken from the teammgr code to describe this flow):
+When a port gets removed and created again, notification is triggered
+when state database gets updated. In this situation, the port needs
+to be enslaved into the LAG again.
+del event -> do nothing
+ +**No need to change the code** + +#### Macsecmgr: +Listen to events on cfg port table – the service will enable or disable macsec if macsec was configured on the port cfg table (using the macsec field)
+
+**No need to change the code**
+
+#### snmpagent:
+• Add/remove port has no special treatment.
+each time the snmpagent needs information from ports (oper_state, mtu, speed..) it reads from APP port table. Will be triggered on mib requests.
+
+**No need to change the code**
+
+#### PMON - Xcvrd:
+Listen to events on cfg port table and update transceiver information
+ +implemented on those PRs: +https://github.com/Azure/sonic-buildimage/pull/8422
+https://github.com/Azure/sonic-platform-daemons/pull/212 + + +## Buffermgrd: + +##### Add port: +- If a port is added without a buffer configuration the buffer configuration the SDK will “decide” the default buffer values for this port. +- The user can add the port on admin state down -> later add the buffer configuration (static or dynamic) -> enable this port. +For example, in the line-card system case: +- When line card is provisioned the line card manager is adding a port to config db, need to add the port with admin state “down”. +- Line card manager will add the buffer configuration for this port through a default buffer cfg template. +- Line cart manager will enable the port. + +• Pg_profile_lookup file has values that will be used for static buffer configuration. +for each port speed and cable length we have buffer size value, xon and xoff value
+For example: + + +|speed cable | cable | size | xon | xoff | threshold | +|:----------:|:-----:|:-----:|:-----:|:-----:|:----------:| +| 10000 | 5m | 49152 | 19456 | 29696 | 0 | +| 25000 | 5m | 49152 | 19456 | 29696 | 0 | +| 40000 | 5m | 49152 | 19456 | 29696 | 0 | +| 50000 | 5m | 49152 | 19456 | 29696 | 0 | + +On the line-card system we will use different types of line cards (maybe with different gearboxes), the values on the pg_profile_lookup will be used for all the types.
+we may need to consider using pg_profile_lookup.ini for each line card type.
+• When a port is added to the config db – the speed and the admin state are saved on internal db
+• After port was added the user can add buffer configuration to this port (dynamic or static configuration) and only then the buffermgr will set the buffer configuration on App table
+ +• We have rare situation of race condition in the add port flow:
+ + ![possible buffermgr race condition](images/buffermgr_possible_race.png) + +in order to avoid this issue we need to check first if port exist on App db before adding the buffer cfg to the App db. + +##### Del port: +• Before removing a port all buffer configuration needs to be removed
+ +We have also possible way for race condition:
+ + ![possible buffermgr delete port race condition](images/buffermgr_possible_delete_race.png) + +• If the portsyncd is “quicker” than the buffermgr the orchagent will try to remove the port from SAI before the buffer configuration was removed.
+• Need to test this scenario in order to check if this race condition is reproducible or it’s a rare scenario
+• Solution for this:
+Need to add to orchagent the ability to add the buffer configuration of a port and increase a reference counter for each port, in the same way ACL cfg on port is working. We already have infrastructure for this just need to add the buffer cfg to use it. If a port has with buffer cfg on – this port will not be removed. + +If we will not use this mechanism we will get a lot of SAI error and with this ref counter method we will receive only one warning. Also we wanted the buffer configuration to be the same as ACL/VLAN/INTERFACE configuration, which uses the ref counter for the dependencies, and before removing a port we check this ref counter. + +** Buffer changes PR: **
+[https://github.com/Azure/sonic-swss/pull/2022](https://github.com/Azure/sonic-swss/pull/2022) + + + +#### LLDP – lldpmgrd – implementation today: +• Add port: receive port entry set on port config db -> check if oper state is up or wait until oper state up event is received from app db  add lldp port entry with lldpcli command +• Del port: when host interface is removed from system lldp configuration is removed also. + + + + + + ![Add port - LLDP- current implementation](images/lldp_before.png) + + + + + +The Problem: +- when a port is added - the lldpcli execution can failed since the host interface is not yet up. oper_state on on APP DB is up but host interface is not up yet. +- when lldp is removed immediatly after add the lldpcli command wasn't executed yet, the command is still on pending_cmds, there is treatment to remove a port, on lldpmgrd, the command for this port will stay forever on the pending_cmds end each 10 seconds (timeout value) the command will be executed and failed since the host interface is no longer exist. + +Suggested change: +- Before executing lldpcli command we will check if host interface is up by checking the port status on state db
+- When receiving a del event from App db we will remove the command (if it exists) from pending_cmds
+- in the current implementation we receive events on port create/update from config db and app db and later read from config db again - this implementation is unnecessary and can be problematic in several cases. +- The lldpcli is a tool that we can use in order to add ports to lldp.
+ + ![add port - LLDP- suggested change](images/lldp_after.png) + + +### VS test + +1. Basic test (init with full ports):
+ - Start the system with full ports on system
+ - Remove one port
+ - Verify port was removed properly
+ - Add the port back to the system
+ - Verify port was added properly
+ - do the above in a loop several times + - Run ping and verify basic functionality of ports
+2. Basic test (init with full ports, remove and re-add all ports):
+ - Start the system with full ports on system
+ - Remove all the ports
+ - Verify all ports were removed properly
+ - Add the ports back to the system
+ - Verify ports were added properly
+ - Run traffic and verify basic functionality of ports
+ - do the above in a loop several times + diff --git a/doc/port-add-del-dynamically/images/add_flex_counters_diagram.png b/doc/port-add-del-dynamically/images/add_flex_counters_diagram.png new file mode 100755 index 0000000000..4bfdba587c Binary files /dev/null and b/doc/port-add-del-dynamically/images/add_flex_counters_diagram.png differ diff --git a/doc/port-add-del-dynamically/images/add_port_diagram.png b/doc/port-add-del-dynamically/images/add_port_diagram.png new file mode 100755 index 0000000000..630eed005a Binary files /dev/null and b/doc/port-add-del-dynamically/images/add_port_diagram.png differ diff --git a/doc/port-add-del-dynamically/images/buffermgr_possible_delete_race.png b/doc/port-add-del-dynamically/images/buffermgr_possible_delete_race.png new file mode 100755 index 0000000000..ad9a2ebff6 Binary files /dev/null and b/doc/port-add-del-dynamically/images/buffermgr_possible_delete_race.png differ diff --git a/doc/port-add-del-dynamically/images/buffermgr_possible_race.png b/doc/port-add-del-dynamically/images/buffermgr_possible_race.png new file mode 100755 index 0000000000..05382cb14d Binary files /dev/null and b/doc/port-add-del-dynamically/images/buffermgr_possible_race.png differ diff --git a/doc/port-add-del-dynamically/images/init_stage_diagram.png b/doc/port-add-del-dynamically/images/init_stage_diagram.png new file mode 100755 index 0000000000..59e8352a86 Binary files /dev/null and b/doc/port-add-del-dynamically/images/init_stage_diagram.png differ diff --git a/doc/port-add-del-dynamically/images/lldp_after.png b/doc/port-add-del-dynamically/images/lldp_after.png new file mode 100755 index 0000000000..b6f9e1fd94 Binary files /dev/null and b/doc/port-add-del-dynamically/images/lldp_after.png differ diff --git a/doc/port-add-del-dynamically/images/lldp_before.png b/doc/port-add-del-dynamically/images/lldp_before.png new file mode 100755 index 0000000000..e48382ee30 Binary files /dev/null and 
b/doc/port-add-del-dynamically/images/lldp_before.png differ diff --git a/doc/port-add-del-dynamically/images/remove_port_diagram.png b/doc/port-add-del-dynamically/images/remove_port_diagram.png new file mode 100755 index 0000000000..52b9b72694 Binary files /dev/null and b/doc/port-add-del-dynamically/images/remove_port_diagram.png differ diff --git a/doc/port_auto_neg/port-auto-negotiation-design.md b/doc/port_auto_neg/port-auto-negotiation-design.md index a621a2b065..41a9ef83ae 100644 --- a/doc/port_auto_neg/port-auto-negotiation-design.md +++ b/doc/port_auto_neg/port-auto-negotiation-design.md @@ -139,7 +139,7 @@ The related port attributes are listed below: Please note that `SAI_PORT_ATTR_ADVERTISED_INTERFACE_TYPE` is a new attribute introduced in SAI 1.7.1. Vendors need to implement this attribute in their SAI implementation. -### Configuration and management +### Configuration and management #### CLI Enhancements @@ -183,7 +183,7 @@ Return: error message if interface_name or speed_list is invalid otherwise empty Note: - speed_list value "all" means all supported speeds + speed_list value "all" means all supported speeds ``` This command always replace the advertised speeds instead of append. For example, say the current advertised speeds value are "10000,25000", if user configure it with `config interface advertised-speeds Ethernet0 40000,100000`, the advertised speeds value will be changed to "40000,100000". @@ -227,7 +227,7 @@ Return: error message if interface_name or interface_type_list is invalid otherwise empty Note: - interface_type_list value "all" means all supported interface type + interface_type_list value "all" means all supported interface type ``` This command always replace the advertised interface types instead of append. For example, say the current advertised interface types value are "KR4,SR4", if user configure it with `config interface advertised-types Ethernet0 CR4`, the advertised interface types value will be changed to "CR4". 
@@ -346,7 +346,7 @@ The advantage here is that user can get a list of valid interface type from CLI. For speed and adv_speeds, there is a SAI API to get the supported speed list for a given port. The idea here is to query supported speed after orchagent creating port object, and the supported speed list will be save to STATE_DB for CLI to validate. A new field **supported_speeds** will be added to **PORT_TABLE**. If this field is present, CLI will use this field to validate the input speed and adv_speeds argument, otherwise, no validation will be performed on CLI side. The STATE_DB change will be described in [State DB Enhancements](#state-db-enhancements). -#### Config DB Enhancements +#### Config DB Enhancements SONiC already defined two fields related to port speed setting: **speed**, **autoneg**. 3 new fields **adv_speeds**, **interface_type**, **adv_interface_types** will be added to **PORT** table: @@ -396,9 +396,23 @@ To support validate interface speed on CLI side, a new field **supported_speeds* ; field = value ... supported_speeds = STRING ; supported speed list + speed = STRING ; operational speed An example value of supported_speeds could be "10000,25000,40000,100000". +Before this feature, port speed in APP DB indicates both the configured speed and the operational speed. It is OK without this feature because port operational speed must be configured speed or port operational status is down. However, this is not true with this feature. Consider following flow: + +1. Configure port speed to 100G +2. Configure advertised speed to 50G and enable auto negotiation +3. Port operational speed turns to 50G +4. Configure any port attribute, e.g mtu, portsyncd would put all port attributes to APP DB +5. `show interface status` displays port speed as 100G which is incorrect + +To overcome this issue, following changes are required: + +1. Put port operational speed to STATE DB PORT_TABLE +2. 
intfutil, portstat, voqutil shall be change to get port operational speed from STATE DB first. For backward compatible, intfutil, portstat, voqutil shall still get port operational speed from APP DB if port speed is not available in STATE DB or port operational state is down. + #### YANG Model Enhancements The port yang model needs to update according to DB schema change. The yang model changes of new fields are described below: @@ -440,7 +454,7 @@ In current SONiC implementation, if auto negotiation is enabled, it uses the `sp ... "autoneg": "1", "speed": "100000" -} +} ``` Will be migrated to: diff --git a/doc/qos/dscp-remapping-images/Bounced-back-traffic-deadlock.png b/doc/qos/dscp-remapping-images/Bounced-back-traffic-deadlock.png new file mode 100644 index 0000000000..c393533241 Binary files /dev/null and b/doc/qos/dscp-remapping-images/Bounced-back-traffic-deadlock.png differ diff --git a/doc/qos/dscp-remapping-images/Bounced-back-traffic-flow-PFC.png b/doc/qos/dscp-remapping-images/Bounced-back-traffic-flow-PFC.png new file mode 100644 index 0000000000..d34702a688 Binary files /dev/null and b/doc/qos/dscp-remapping-images/Bounced-back-traffic-flow-PFC.png differ diff --git a/doc/qos/dscp-remapping-images/Bounced-back-traffic-flow.png b/doc/qos/dscp-remapping-images/Bounced-back-traffic-flow.png new file mode 100644 index 0000000000..4b9aa16718 Binary files /dev/null and b/doc/qos/dscp-remapping-images/Bounced-back-traffic-flow.png differ diff --git a/doc/qos/dynamically-headroom-calculation.md b/doc/qos/dynamically-headroom-calculation.md index 3bd4c33f33..7145c0d4a7 100644 --- a/doc/qos/dynamically-headroom-calculation.md +++ b/doc/qos/dynamically-headroom-calculation.md @@ -746,7 +746,20 @@ Let's imagine what will happen after a XOFF frame has been sent for a priority. 1. MAC/PHY delay, which is the bytes held in the SWITCH CHIP's egress pipeline and PHY when XOFF has been generated. 2. 
Gearbox delay, which is the latency caused by the Gearbox, if there is one. 3. KB on cable, which is the bytes held in the cable, which is equals the time required for packet to travel from one end of the cable to the other multiplies the port's speed. Obviously, the time is equal to cable length divided by speed of the light in the media. -4. Peer response time, which is the bytes that are held in the peer switch's pipeline and will be send out when the XOFF packet is received. +4. Peer response time. When a switch receives a pause frame, it will not stop the packet transmission immediately, because it needs to drain the frames which already been submitted to the MAC layer. So extra buffer shall be considered to handle the peer delay response. IEEE 802.3 31B.3.7 defines how many pause_quanta shall wait upon an XOFF. A pause_quanta is equal to the time required to transmit 512 bits of a frame at the data rate of the MAC. At different operating speeds, the number of pause_quanta shall be taken are also different. Following table shows the number of pause_quanta that shall be taken for each speed. 
+ + + | Operating speed | Number of pause_quanta | + |:--------:|:-----------------------------:| + | 100 Mb/s | 1 | + | 1 Gb/s | 2 | + | 10 Gb/s | 67 | + | 25 Gb/s | 80 | + | 40 Gb/s | 118 | + | 50 Gb/s | 147 | + | 100 Gb/s | 394 | + | 200 Gb/s | 453 | + | 400 Gb/s | 905 | Let's consider the flow of XOFF packet generating and handling: @@ -773,6 +786,9 @@ Therefore, headroom is calculated as the following: - `cell occupancy` = (100 - `small packet percentage` + `small packet percentage` * `worst case factor`) / 100 - `kb on cable` = `cable length` / `speed of light in media` * `port speed` - `kb on gearbox` = `port speed` * `gearbox delay` / 8 / 1024 +- `peer response` = + - if can get a valid pause quanta, `peer response` = (`number of pause_quanta` * 512) / 8 + - otherwise, use the default value, `peer response`: ASIC_TABLE|\|peer_response_time - `propagation delay` = `port mtu` + 2 * (`kb on cable` + `kb on gearbox`) + `mac/phy delay` + `peer response` - `Xon` = `pipeline latency` - `Xoff` = `lossless mtu` + `propagation delay` * `cell occupancy` diff --git a/doc/qos/mpls_tc_to_tc_map.md b/doc/qos/mpls_tc_to_tc_map.md new file mode 100644 index 0000000000..0bd36ee2f3 --- /dev/null +++ b/doc/qos/mpls_tc_to_tc_map.md @@ -0,0 +1,138 @@ +# MPLS TC to TC map + +## 1. Table of Content + +- [MPLS TC to TC map](#mpls-tc-to-tc-map) + - [1. Table of Content](#1-table-of-content) + - [2. Revision](#2-revision) + - [3. Scope](#3-scope) + - [4. Definitions/Abbreviations](#4-definitionsabbreviations) + - [5. Overview](#5-overview) + - [6. Requirements](#6-requirements) + - [7. Architecture Design](#7-architecture-design) + - [8. High-Level Design](#8-high-level-design) + - [8.1. DB](#81-db) + - [8.2. sonic-swss-common](#82-sonic-swss-common) + - [8.3. sonic-swss](#83-sonic-swss) + - [8.4. sonic-utilities](#84-sonic-utilities) + - [8.5. Other implications](#85-other-implications) + - [9. SAI API](#9-sai-api) + - [10. 
Configuration and management](#10-configuration-and-management) + - [10.1. CLI/YANG model Enhancements](#101-cliyang-model-enhancements) + - [10.2. Config DB Enhancements](#102-config-db-enhancements) + - [11. Warmboot and Fastboot Design Impact](#11-warmboot-and-fastboot-design-impact) + - [12. Restrictions/Limitations](#12-restrictionslimitations) + - [13. Testing Requirements/Design](#13-testing-requirementsdesign) + - [13.1. Unit Test cases](#131-unit-test-cases) + - [13.2. System Test cases](#132-system-test-cases) + - [14. Open/Action items - if any](#14-openaction-items---if-any) + +## 2. Revision + +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:-----------------------:|--------------------------------------------| +| 0.1 | 16/08/2021 | Alexandru Banu | Initial version | +| 0.2 | 21/09/2021 | Alexandru Banu | Renamed MPLS EXP to MPLS TC per RFC 5462 | +| 0.3 | 22/09/2021 | Alexandru Banu | Added per-port binding configuration | + +## 3. Scope + +This HLD extends SONiC to support MPLS TC to TC mappings. + +## 4. Definitions/Abbreviations + +TC = Traffic Class +QoS = Quality of Service + +## 5. Overview + +This new enhancement adds support to SONiC for MPLS TC to TC map which allows QoS to work on MPLS packets. + +## 6. Requirements + +User can configure MPLS TC to TC map at start-of-day via configuration file. CLI support will exist to offer the same amount of support as for DSCP to TC map. + +## 7. Architecture Design + +The overall SONiC architecture will not be changed and no new sub-modules will be introduced. + +## 8. High-Level Design + +### 8.1. DB + +The CONFIG DB will be updated to include a new "MPLS_TC_TO_TC_MAP" similar to the existing "DSCP_TO_TC_MAP". 
This will have the following format: +``` +### MPLS_TC_TO_TC_MAP + ; MPLS TC to TC map + ;SAI mapping - qos_map object with SAI_QOS_MAP_ATTR_TYPE == sai_qos_map_type_t::SAI_QOS_MAP_MPLS_EXP_TO_TC + key = "MPLS_TC_TO_TC_MAP|"name + ;field value + mpls_tc_value = 1*DIGIT + tc_value = 1*DIGIT + + Example: + 127.0.0.1:6379> hgetall "MPLS_TC_TO_TC_MAP|Mpls_tc_to_tc_map1" + 1) "3" ;mpls tc + 2) "3" ;tc + 3) "6" + 4) "5" + 5) "7" + 6) "5" +``` + +In order to allow a user to bind such a map to a port, the existing `PORT_QOS_MAP` table will be enhanced to allow a new field-value pair, where the field is going to be named `mpls_tc_to_tc_map` and the value will be the `MPLS_TC_TO_TC_MAP.key` of the map to use. + +### 8.2. sonic-swss-common + +sonic-swss-common's schema will be updated to include a CFG_MPLS_TC_TO_TC_MAP_TABLE_NAME define for the new table name. + +### 8.3. sonic-swss + +sonic-swss's QoS orch will be updated to include a new handler for MPLS TC to TC map, similar to the existing DSCP to TC map but with extra input validations, checking that the values are in the correct numeric range and that no MPLS TC value is mapped to more than one TC value. Among debugging logs, appropriate error logs will be introduced to let the user know if they miss-configured a map. + +Also, the QoS orch will be enhanced to configure the new field-value pair in `PORT_QOS_MAP` mentioned at section 8.1. + +### 8.4. sonic-utilities + +sonic-utilities will be updated to offer the same amount of support for CLI commands that DSCP to TC map already provide. + +### 8.5. Other implications + +There are no other implications. SAI and sairedis already support for MPLS TC to TC map. In terms of warm restart / fastboot / scalability / performance and so on, this should not represent an impact. + +## 9. SAI API + +MPLS TC to TC map are already supported in SAI. + +## 10. Configuration and management + +### 10.1. 
CLI/YANG model Enhancements + +CLI config commands will be updated to include the same level of support for MPLS TC to TC maps as for DSCP to TC maps. Namely, `config reload` and `config clear` will be updated to include the new mapping table as well. + +### 10.2. Config DB Enhancements + +The relevant changes have been described in HLD's DB sub-section. + +## 11. Warmboot and Fastboot Design Impact + +Not impacted by the changes. + +## 12. Restrictions/Limitations + +- User can't configure MPLS TC to TC map via CLI (only via reload command). +- User can't configure per-switch or per-inseg MPLS TC to TC maps. + +## 13. Testing Requirements/Design + +### 13.1. Unit Test cases + +The QoS UTs present in sonic-swss will be extended to accommodate the new MPLS TC to TC map. These will largely follow the DSCP to TC map example but will add input validation checks as well. The new code will have full code coverage as far as the UT framework allows it. + +### 13.2. System Test cases + +No system test cases will be added. + +## 14. 
Open/Action items - if any + +N/A diff --git a/doc/qos/reclaim-reserved-buffer-images/create-port-profile-list.jpg b/doc/qos/reclaim-reserved-buffer-images/create-port-profile-list.jpg new file mode 100644 index 0000000000..b6f9282d10 Binary files /dev/null and b/doc/qos/reclaim-reserved-buffer-images/create-port-profile-list.jpg differ diff --git a/doc/qos/reclaim-reserved-buffer-images/create-queue.jpg b/doc/qos/reclaim-reserved-buffer-images/create-queue.jpg new file mode 100644 index 0000000000..0bc703234f Binary files /dev/null and b/doc/qos/reclaim-reserved-buffer-images/create-queue.jpg differ diff --git a/doc/qos/reclaim-reserved-buffer-images/deploy.jpg b/doc/qos/reclaim-reserved-buffer-images/deploy.jpg new file mode 100644 index 0000000000..8210d1027e Binary files /dev/null and b/doc/qos/reclaim-reserved-buffer-images/deploy.jpg differ diff --git a/doc/qos/reclaim-reserved-buffer-images/dynamic-new.jpg b/doc/qos/reclaim-reserved-buffer-images/dynamic-new.jpg new file mode 100644 index 0000000000..8408184620 Binary files /dev/null and b/doc/qos/reclaim-reserved-buffer-images/dynamic-new.jpg differ diff --git a/doc/qos/reclaim-reserved-buffer-images/dynamic-original.jpg b/doc/qos/reclaim-reserved-buffer-images/dynamic-original.jpg new file mode 100644 index 0000000000..3785384ef8 Binary files /dev/null and b/doc/qos/reclaim-reserved-buffer-images/dynamic-original.jpg differ diff --git a/doc/qos/reclaim-reserved-buffer-images/dynamic-port-init.jpg b/doc/qos/reclaim-reserved-buffer-images/dynamic-port-init.jpg new file mode 100644 index 0000000000..fbc0b25fff Binary files /dev/null and b/doc/qos/reclaim-reserved-buffer-images/dynamic-port-init.jpg differ diff --git a/doc/qos/reclaim-reserved-buffer-images/normal.jpg b/doc/qos/reclaim-reserved-buffer-images/normal.jpg new file mode 100644 index 0000000000..29530e8b06 Binary files /dev/null and b/doc/qos/reclaim-reserved-buffer-images/normal.jpg differ diff --git 
a/doc/qos/reclaim-reserved-buffer-images/reclaim-reserved-buffer-sequence-flow.md b/doc/qos/reclaim-reserved-buffer-images/reclaim-reserved-buffer-sequence-flow.md new file mode 100644 index 0000000000..9b017e4ff5 --- /dev/null +++ b/doc/qos/reclaim-reserved-buffer-images/reclaim-reserved-buffer-sequence-flow.md @@ -0,0 +1,271 @@ +```mermaid +%Scripts can be rendered online by https://mermaid-js.github.io/mermaid-live-editor/edit +%Deploy flow +sequenceDiagram + participant User + participant CLI + participant minigraph + participant sonic cfggen + participant buffer template + participant DATABASE + User ->> minigraph: set device type + loop for each used port + User ->> minigraph: set speed + User ->> minigraph: set neighbor device name + User ->> minigraph: set neighbor device detail + User ->> minigraph: set other info (not related to buffer) + end + User ->>+ CLI: Execute "config load-minigraph" + CLI ->>+ sonic cfggen: load minigraph + sonic cfggen ->>+ minigraph: Load minigraph information + minigraph ->>- sonic cfggen: Return minigraph info + sonic cfggen ->> DATABASE: Set device type: ToRRouter, LeafRouter, or SpineRouter + loop for each port + sonic cfggen ->> DATABASE: Set port admin status to up if port is active + sonic cfggen ->> DATABASE: Set port speed + sonic cfggen ->> DATABASE: Set port's cable length according to both ends + end + Note over sonic cfggen, buffer template: Collect active ports and inactive ports + loop for each port + alt Neithbor is defined for the port + sonic cfggen ->> buffer template: Add port to ACTIVE PORT set + else + rect rgb(255, 0, 255) + sonic cfggen ->> buffer template: Add port to INACTIVE PORT set + end + end + end + sonic cfggen ->> sonic cfggen: Determine switch's topology according to its device type + sonic cfggen ->> buffer template: Load buffer template according to SKU and topo + buffer template ->> sonic cfggen: Return buffer templates + Note over sonic cfggen, DATABASE: Generating buffer table items by 
rendering buffer templates. + sonic cfggen ->> DATABASE: Generate default buffer pool objects + sonic cfggen ->> DATABASE: Generate default buffer profile objects + rect rgb(255, 0, 255) + opt INACTIVE PORT is not empty + sonic cfggen ->> DATABASE: Generate default zero buffer profile objects + end + end + loop for each active port + sonic cfggen ->> DATABASE: Generate BUFFER_QUEUE item for queue 0-2, 3-4, 5-6 for the port + sonic cfggen ->> DATABASE: Generate BUFFER_PORT_INGRESS_PROFILE_LIST item + sonic cfggen ->> DATABASE: Generate BUFFER_PORT_EGRESS_PROFILE_LIST item + Note over sonic cfggen, DATABASE: Generat lossy PGs by rendering the buffer template if NO special script to generate them + sonic cfggen ->> DATABASE: Generate lossy BUFFER_PG item PG 0 for the port, using normal ingress lossy buffer profile + end + rect rgb(255, 0, 255) + opt zero profiles exist + Note over sonic cfggen, DATABASE: Generate items for inactive ports by rendering bufer template if zero profiles exist + loop for each inactive port + sonic cfggen ->> DATABASE: Generate zero buffer profile item in BUFFER_QUEUE table for queue 0-7 + sonic cfggen ->> DATABASE: Generate zero buffer profile item in BUFFER_PORT_INGRESS_PROFILE_LIST table + sonic cfggen ->> DATABASE: Generate zero buffer profile item in BUFFER_PORT_EGRESS_PROFILE_LIST table + sonic cfggen ->> DATABASE: Generate zero buffer profile item for lossy PG 0 in BUFFER_PG table + end + end + end + sonic cfggen ->>- CLI: Finish +``` + +```mermaid +%Normal flow +sequenceDiagram + participant User + participant DATABASE + participant buffer manager + participant buffer orch + participant SAI + participant SDK + User ->> DATABASE: Configure cable length and speed or admin-status + DATABASE ->> buffer manager: Update notification + alt Handle the case port is admin-down + rect rgb(255, 0, 255) + buffer manager ->> DATABASE: Remove lossless buffer PG + end + else + opt cable length or speed is not configured + buffer manager ->> 
DATABASE: Finish (need retry) + end + opt buffer profile doesn't exist? + buffer manager ->> buffer manager: Fetch headroom parameter according to cable length/speed + buffer manager ->> DATABASE: Create buffer profile and push into BUFFER_PROFILE + DATABASE ->>+ buffer orch: Update notification + buffer orch ->>+ SAI: sai_buffer_api->create_buffer_profile + SAI ->>- buffer orch: Finish + buffer orch ->>- DATABASE: Finish + end + buffer manager ->> DATABASE: Create buffer PG and push into BUFFER_PG for PG 3-4 + DATABASE ->>+ buffer orch: Update notification + loop for PG in [3, 4] + Note over buffer orch, SAI: attr.id = SAI_INGRESS_PRIORITY_GROUP_ATTR_BUFFER_PROFILE + Note over buffer orch, SAI: attr.value.oid = OID of corresponding buffer profile; + buffer orch ->>+ SAI: sai_buffer_api->set_ingress_priority_group_attribute + SAI ->>+ SDK: Set parameters of PG according to buffer profile + SDK ->>- SAI: Finish + SAI ->>- buffer orch: Finish + end + buffer orch ->>- DATABASE: Finish + end +``` + +```mermaid +%Create queue flow + +sequenceDiagram + participant User + participant DATABASE + participant buffer orch + participant SAI + participant SDK + User ->> DATABASE: Configure an entry in BUFFER_QUEUE + DATABASE ->>+ buffer orch: Update notification + buffer orch ->> buffer orch: Fetch the OID of buffer profile + loop for queue in list + Note over buffer orch, SAI: attr.id = SAI_QUEUE_ATTR_BUFFER_PROFILE_ID + Note over buffer orch, SAI: attr.value.oid = OID of corresponding buffer profile; + buffer orch ->>+ SAI: sai_queue_api->set_queue_attribute(queue, &attr) + SAI ->>+ SDK: Set parameters of the queue according to buffer profile + SDK ->>- SAI: Finish + SAI ->>- buffer orch: Finish + end + buffer orch ->>- DATABASE: Finish +``` + +```mermaid +%Create port profile list flow + +sequenceDiagram + participant User + participant DATABASE + participant buffer orch + participant SAI + participant SDK + User ->> DATABASE: Configure an entry in 
BUFFER_PORT_INGRESS/EGRESS_PROFILE_LIST + DATABASE ->>+ buffer orch: Update notification + loop for profile in profile_list + buffer orch ->> buffer orch: Fetch the OID of buffer profile + buffer orch ->> buffer orch: Insert the OID to oid_list + end + loop for queue in list + alt BUFFER_PORT_INGRESS_PROFILE_LIST + Note over buffer orch, SAI: attr.id = SAI_PORT_ATTR_QOS_INGRESS_BUFFER_PROFILE_LIST + else BUFFER_PORT_EGRESS_PROFILE_LIST + Note over buffer orch, SAI: attr.id = SAI_PORT_ATTR_QOS_EGRESS_BUFFER_PROFILE_LIST + end + Note over buffer orch, SAI: attr.value.oid = oid_list + buffer orch ->>+ SAI: sai_port_api-->set_port_attribute(port, &attr) + loop for each OID in oid_list + SAI ->>+ SDK: Set parameters of the port buffer pool according to buffer profile + SDK ->>- SAI: Finish + end + SAI ->>- buffer orch: Finish + end + buffer orch ->>- DATABASE: Finish +``` + +```mermaid +%Dynamic-port-init +sequenceDiagram + participant Kernel stack + participant port manager + participant ports orchagent + participant buffer manager + participant buffer manager internal data + participant SAI + participant CONFIG_DB + participant APPL_DB + participant STATE_DB + CONFIG_DB ->> port manager: A port is heard from CONFIG_DB + loop for each attribute of the port + port manager ->> Kernel stack: Set corresponding port attributes in kernel netdev + port manager ->> APPL_DB: Push the attribute into APPL_DB.PORT_TABLE + end + APPL_DB ->> ports orchagent: A port is heard from APPL_DB + ports orchagent ->> ports orchagent: Initialize the port (other steps omitted) + ports orchagent ->> SAI: query maximum number of queues + loop for each queue + ports orchagent ->> ports orchagent: Initialize the queue + end + ports orchagent ->> SAI: query maximum number of PGs + loop for each queue + ports orchagent ->> ports orchagent: Initialize the queue + end + ports orchagent ->> SAI: query maximum headroom size of the port + ports orchagent ->> STATE_DB: Push maximum numbers into 
STATE_DB.BUFFER_MAX_PARAM_TABLE + rect rgb(255, 0, 255) + STATE_DB ->> buffer manager: Maximum numbers of the port heard + buffer manager ->> buffer manager internal data: Generate ID maps of all queues and PGs + end +``` + +```mermaid +%Dynamic-original-flow +sequenceDiagram + participant User + participant CONFIG_DB + participant buffer manager + participant APPL_DB + participant buffer orch + participant SAI + participant SDK + User ->> CONFIG_DB: Shutdown the port + CONFIG_DB ->> buffer manager: Update notification + loop for each buffer PG object + buffer manager ->> APPL_DB: remove the object from APPL_DB + APPL_DB ->> buffer orch: Update notification + Note over buffer orch, SAI: attr.id = SAI_INGRESS_PRIORITY_GROUP_ATTR_BUFFER_PROFILE + Note over buffer orch, SAI: attr.value.oid = SAI_NULL_OBJECT_ID + buffer orch ->>+ SAI: sai_buffer_api->set_ingress_priority_group_attribute + SAI ->>+ SDK: Set the reserved size and headroom size to 0 + SDK ->>- SAI: Finish + SAI ->>- buffer orch: Finish + end +``` + +```mermaid +%Dynamic-new-flow +sequenceDiagram + participant User + participant CONFIG_DB + participant buffer manager + participant APPL_DB + User ->> CONFIG_DB: Shutdown the port + CONFIG_DB ->> buffer manager: Update notification + rect rgb(255, 0, 255) + opt zero profiles haven't been inserted to APPL_DB + buffer manager ->> APPL_DB: Insert zero pools and profiles into APPL_DB + end + loop for each buffer PG configured on the port + alt lossless + alt support removing PGs + buffer manager ->> APPL_DB: Remove the buffer item from BUFFER_PG table + else + buffer manager ->> APPL_DB: Apply zero profile to the PG in BUFFER_PG table + end + else + buffer manager ->> APPL_DB: Apply zero profile to the PG in BUFFER_PG table + end + end + opt (Not all PGs on which zero profile needs to be applied are configured) and (removing PGs is supported) + loop for each of the rest PGs + buffer manager ->> APPL_DB: Apply zero profile to the PG in BUFFER_PG table + end + end 
+ loop for each buffer queue configured on the port + buffer manager ->> APPL_DB: Apply zero profile to the queue in BUFFER_QUEUE table + end + opt (Not all queues on which zero profile needs to be applied are configured) and (removing queues is supported) + loop for each of the rest PGs + buffer manager ->> APPL_DB: Apply zero profile to the queue in BUFFER_QUEUE table + end + end + buffer manager ->> APPL_DB: Set the profile of the buffer object to the zero buffer profile + loop For each profile_list in [BUFFER_PORT_INGRESS_PROFILE_LIST, BUFFER_PORT_EGRESS_PROFILE_LIST] + loop For each profile in profile_list + buffer manager ->> buffer manager: Fetch the zero profile of the pool referenced by the profile + buffer manager ->> buffer manager: Add the zero_profile to the list + end + buffer manager ->> APPL_DB: Update the profile list + end + end +``` diff --git a/doc/qos/reclaim-reserved-buffer.md b/doc/qos/reclaim-reserved-buffer.md new file mode 100644 index 0000000000..dd6e8d8791 --- /dev/null +++ b/doc/qos/reclaim-reserved-buffer.md @@ -0,0 +1,1044 @@ +# Reclaim reserved buffer # + +## 1 Table of Content ### + +### 1.1 Revision ### + +## 2 Scope ## + +This section describes the scope of this high-level design document in SONiC. + +## 3 Definitions/Abbreviations ## + +This section covers the abbreviation if any, used in this high-level design document and its definitions. + +| Term | Meaning | +|:--------:|:---------------------------------------------:| +| buffer object | The buffer configuration for priority groups, queues, ingress or egress profile lists | + +## 4 Overview ## + +Shared buffer is used to absorb traffic when a switch is under congestion. The larger the buffer, the better the performance in terms of congestion handling. + +On Mellanox platforms, buffers are reserved for each port, PG and queue. The size of shared buffer pool is equal to the total memory minus the accumulative reserved buffers. 
So we would like to reduce the reserved buffer as much as possible. One way to do that is to reclaim the buffers reserved for admin down ports. + + There are some admin down ports in user's scenario. There should not be any buffer reserved for admin down ports but currently there are by default. + + The purpose of this document is to provide a way to reclaim the buffer reserved for admin down ports and then increase the shared buffer pool size. + + ## 5 Requirements ## + + The requirement is to reclaim the reserved buffer for admin down ports, including: + + - Buffer reserved SONiC configuration + - BUFFER_PG + - BUFFER_QUEUE + - BUFFER_PORT_INGRESS_PROFILE_LIST / BUFFER_PORT_EGRESS_PROFILE_LIST + + The reserved buffer is reclaimed when the port is admin down. The port can be admin down in the following two scenarios: + + 1. The port is not used by deployment. In other words, the port is an INACTIVE port. + 2. The port is temporarily shut down for maintenance, for example, to replace a broken cable or module attached on the port. + + The buffer reserved for the port is reclaimed in both scenarios. + + ## 6 Architecture Design ## + + ### 6.1 The way to set reserved buffer to zero ### + + Currently, the reserved size of a buffer object is set to zero when it is removed from `BUFFER_PG` or `BUFFER_QUEUE` table. A way to reclaim buffer is + + - SONiC to remove objects of admin-down ports from `BUFFER_PG` and `BUFFER_QUEUE` tables + - SAI to set reserved size of buffer objects to zero on removing them. + + However, this creates inconsistency. Consider the following scenarios: + + 1. System starting flow. SAI will not touch buffer objects if there is no buffer related configuration applied from SONiC, leaving them as the SDK default value. As a result, for any buffer object, + - There is no buffer configuration in SONiC + - The reserved buffer size in the ASIC is the SDK default value, some of which are not zero + 2. System started, an existing buffer object is removed. 
SONiC notifies SAI by setting profile to `SAI_NULL_OBJECT_ID`. SAI will set the reserved size of corresponding buffer object to zero. As a result, for the buffer object, + - There is no buffer configuration in SONiC + - The reserved buffer size in the ASIC is zero + +For some of the buffer objects, the SDK default reserved size is not zero. This is to make sure the system works correctly without any buffer configuration in SONiC. +Now we have same SONiC configuration in 1 and 2 but different reserved buffer size in the ASIC. + +To make it clear and consistent, we need the following solution: + +- For lossless buffer priority groups, SONiC should remove them from SAI when the port is admin down. +- For other buffer objects: + - Introduce a new type of buffer profiles - `zero profile`. + - Apply the `zero profile` to the buffer objects in order to reclaim reserved buffer. +- What do the `zero profile`s look like (on Mellanox platform): + - For lossy priority groups: + - Create a `zero pool` with static threshold mode and 0 as buffer pool size. + - Create the `zero profile` with static threshold mode, and 0 for both `static_th` and `size`. + - For queues and buffer port ingress/egress profile list: + - Create a `zero profile` with dynamic threshold mode and 0 as `size`. +- When will the `zero profile`s be created: + - In `traditional buffer model`: + - `zero profile`s will be created if there are unused ports during deployment. + - It's user's responsibility to create `zero profile` if he/she disables a port on-the-fly. + - In `dynamic buffer model`: + - `zero profile`s will be created once at least 1 unused port exists. + - `zero profile`s will be pushed into `APPL_DB` only. +- SAI should configure buffer objects to: + - SDK default value, if there is no buffer profile configured for the object in SONiC when SAI is starting. 
+ - SDK default value, if a buffer object is removed from SONiC + - Zero, only if the object is configured with an `zero profile` + +To achieve it the following steps will be taken: + +1. A series of `zero_profile`s should be defined for ingress/egress and lossless/lossy traffic. +2. Currently, there is no buffer object configured on admin-down ports. The `zero_profile` should be configured explicitly on admin-down ports. +3. Database migrator is required to bridge the gap between the old and new system when a switch is upgraded. + +This can be implemented on a per-vendor basis. For vendors with zero buffer profiles provided in buffer template, we will go this way. Otherwise, the reserved buffer will be reclaimed by removing corresponding buffer objects. + +This is for both static and dynamic buffer model. + +## 7 Static buffer model ## + +In static buffer model, buffer manager is responsible for: + +- Create a buffer profile entry in `CONFIG_DB.BUFFER_PROFILE` table when the `speed`, `cable length` tuple occurs for the first time + + The parameters, including `xon`, `xoff`, `size`, `threshold` are looked up from `pg_profile_lookup.ini` with `speed` and `cable length` as the key. +- Create a buffer priority-group entry in `CONFIG_DB.BUFFER_PG` table. + +All other buffer related configuration will be provided by the user. + +### 7.1 Deploy the buffer configuration for a switch ### + +By default, the buffer configuration is applied during deployment of the switch. Buffer configuration will be applied on active ports only. A port with neighbor device defined in `minigraph` will be treated as an active port. + +To deploy the switch, the related information should be provided by user in `minigraph`. The information related to buffer configuration includes: + +- Device type which can be one of `ToRRouter`, `LeafRouter`, and `SpineRouter`. 
+- Speed of each active port +- Neighbor device of each active port +- Meta data of the neighbors of active ports, like `type`, which which can be one of `server`, `ToRRouter`, `LeafRouter`, and `SpineRouter`. + +The system will generate necessary items and push them into `CONFIG_DB`, which effectively configures buffer for the active ports. + +- `admin status` in `PORT` table + - `up` for active ports + - `down` for inactive ports +- `speed` in `PORT` table +- `cable length` in `CABLE_LENGTH` table + - `40m` for ports connected between a `LeafRouter` and a `ToRRouter` + - `300m` or `2000m` for ports connected between a `LeafRouter` and a `SpineRouter` + - `5m` otherwise +- Determine switch's topology according to switch's device type: + - `ToRRouter` - t0 + - `LeafRouter` - t1 +- Create the following items by rendering `buffer template` according to `hwsku` and `topo` + - Buffer pools in `BUFFER_POOL` table + - `ingress_lossless_pool` + - `ingress_lossy_pool`, only available in general SKU + - `egress_lossless_pool` + - `egress_lossy_pool` + - Buffer profiles in `BUFFER_PROFILE` table + - `ingress_lossless_profile`, for ingress lossless port buffer pool + - `ingress_lossy_profile`, for ingress lossy port buffer pool and priority group + - `egress_lossless_profile`, for egress lossless port buffer pool and queue + - `egress_lossy_profile`, for egress lossy port buffer pool + - `q_lossy_profile`, for egress lossy queue + - Zero buffer profiles in `BUFFER_PROFILE` table + - `ingress_lossless_zero_profile`, zero profile for ingress lossless traffic + - `ingress_lossy_zero_profile`, zero profile for ingress lossy traffic + - `egress_lossless_zero_profile`, zero profile for egress lossless traffic + - `egress_lossy_zero_profile`, zero profile for egress lossy traffic + - Buffer queue items in `BUFFER_QUEUE` table + - Buffer priority group items in `BUFFER_PG` table + - Buffer ingress and egress port profile list in `BUFFER_PORT_INGRESS_PROFILE_LIST` and 
`BUFFER_PORT_EGRESS_PROFILE_LIST` respectively. + + #### 7.1.1 Buffer template update for zero buffer profiles #### + + ##### 7.1.1.1 Macro to generate inactive ports ##### + + The inactive ports list is generated by the following buffer template. + + These macros are defined in generic buffer template file `buffer_template.j2`. + + The following snippet of code is to generate `PORT_INACTIVE` which contains all the inactive ports. Only if `PORT_INACTIVE` is non-empty will the zero buffer profiles be generated in vendor specific template. + + They need to be moved from the middle of `buffer_template.j2` to the place just before the vendor specific template is imported, so that the vendor specific template has access to `PORT_INACTIVE`. + + ```json +{%- set PORT_ALL = [] %} + +{%- if PORT is not defined %} + {%- if defs.generate_port_lists(PORT_ALL) %} {% endif %} +{%- else %} + {%- for port in PORT %} + {%- if PORT_ALL.append(port) %}{%- endif %} + {%- endfor %} +{%- endif %} + +{%- set PORT_ACTIVE = [] %} +{%- set PORT_INACTIVE = [] %} +{%- if DEVICE_NEIGHBOR is not defined %} + {%- set PORT_ACTIVE = PORT_ALL %} +{%- else %} + {%- for port in PORT_ALL %} + {%- if port in DEVICE_NEIGHBOR.keys() %} + {%- if PORT_ACTIVE.append(port) %}{%- endif %} + {%- else %} + {%- if PORT_INACTIVE.append(port) %}{%- endif %} + {%- endif %} + {%- endfor %} +{%- endif %} + +{# Import default values from device HWSKU folder #} +{%- import 'buffers_defaults_%s.j2' % filename_postfix as defs with context %} +``` + +The variable `port_names_inactive` also needs to be generated by the following snippet of code. 
+ +```json +{%- set port_names_list_inactive = [] %} +{%- for port in PORT_INACTIVE %} + {%- if port_names_list_inactive.append(port) %}{%- endif %} +{%- endfor %} +{%- set port_names_inactive = port_names_list_inactive | join(',') %} +``` + +##### 7.1.1.2 Macro to generate buffer pool and profiles ##### + +Zero buffer profiles should be defined for ingress/egress and lossless/lossy traffic in the buffer template. To achieve that the macro `generate_buffer_pool_and_profiles` needs to be updated. + +This macro is defined in vendor specific buffer template files. + +```json +{%- macro generate_buffer_pool_and_profiles() %} + "BUFFER_POOL": { +{%- if dynamic_mode is not defined and PORT_INACTIVE is defined and PORT_INACTIVE|length > 0 %} + "ingress_zero_pool" : { + "mode": "static", + "type": "ingress", + "size": "0" + }, +{%- endif %} + "ingress_lossless_pool": { + {%- if dynamic_mode is not defined %} + "size": "{{ ingress_lossless_pool_size }}", + {%- endif %} + "type": "ingress", + "mode": "dynamic" + }, + "ingress_lossy_pool": { + {%- if dynamic_mode is not defined %} + "size": "{{ ingress_lossy_pool_size }}", + {%- endif %} + "type": "ingress", + "mode": "dynamic" + }, + "egress_lossless_pool": { + "size": "{{ egress_lossless_pool_size }}", + "type": "egress", + "mode": "dynamic" + }, + "egress_lossy_pool": { + {%- if dynamic_mode is not defined %} + "size": "{{ egress_lossy_pool_size }}", + {%- endif %} + "type": "egress", + "mode": "dynamic" + } + }, + "BUFFER_PROFILE": { +{%- if dynamic_mode is not defined and PORT_INACTIVE is defined and PORT_INACTIVE|length > 0 %} + "ingress_lossy_pg_zero_profile" : { + "pool":"[BUFFER_POOL|ingress_zero_pool]", + "size":"0", + "static_th":"0" + }, + "ingress_lossless_zero_profile" : { + "pool":"[BUFFER_POOL|ingress_lossless_pool]", + "size":"0", + "dynamic_th":"-8" + }, + "ingress_lossy_zero_profile" : { + "pool":"[BUFFER_POOL|ingress_lossy_pool]", + "size":"0", + "dynamic_th":"-8" + }, + "egress_lossless_zero_profile" : { 
+ "pool":"[BUFFER_POOL|egress_lossless_pool]", + "size":"0", + "dynamic_th":"-8" + }, + "egress_lossy_zero_profile" : { + "pool":"[BUFFER_POOL|egress_lossy_pool]", + "size":"0", + "dynamic_ty":"-8" + }, +{%- endif %} + "ingress_lossless_profile": { + "pool":"[BUFFER_POOL|ingress_lossless_pool]", + "size":"0", + "dynamic_th":"7" + }, + "ingress_lossy_profile": { + "pool":"[BUFFER_POOL|ingress_lossy_pool]", + "size":"0", + "dynamic_th":"3" + }, + "egress_lossless_profile": { + "pool":"[BUFFER_POOL|egress_lossless_pool]", + "size":"0", + "dynamic_th":"7" + }, + "egress_lossy_profile": { + "pool":"[BUFFER_POOL|egress_lossy_pool]", + "size":"9216", + "dynamic_th":"7" + }, + "q_lossy_profile": { + "pool":"[BUFFER_POOL|egress_lossy_pool]", + "size":"0", + "dynamic_th":"3" + } + }, +{%- endmacro %} +``` + +##### 7.1.1.3 Macro to apply zero buffer profiles to inactive ports ##### + +The zero profiles should be configured explicitly on admin-down ports by the following buffer template. + +Originally, the macros to generate `BUFFER_QUEUE`, `BUFFER_PG`, `BUFFER_PORT_INGRESS_PROFILE_LIST` and `BUFFER_PORT_EGRESS_PROFILE_LIST` take only one argument which is `port_names_active`. Now that `zero profile`s also need to be applied on inactive ports, the macros need to be extended to support inactive ports as an argument. + +To tolerance the vendors who do not support the additional argument, both the original version and the extended version should be supported in generic buffer template. + +- `generate_profile_lists_with_inactive_ports` and `generate_profile_lists` +- `generate_pg_profiles_with_inactive_ports` and `generate_pg_profils` +- `generate_queue_buffers_with_inactive_ports` and `generate_queue_buffers` + +These macros are defined in vendor specific buffer template files. 
+ +generate_profile_lists_with_inactive_ports: + +```json +{%- macro generate_profile_lists_with_inactive_ports(port_names_active, port_names_inactive) %} + "BUFFER_PORT_INGRESS_PROFILE_LIST": { +{% for port in port_names_active.split(',') %} + "{{ port }}": { + "profile_list" : "[BUFFER_PROFILE|ingress_lossless_profile],[BUFFER_PROFILE|ingress_lossy_profile]" + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% if port_names_inactive|length > 0 %} +, +{% for port in port_names_inactive.split(',') %} + "{{ port }}": { +{% if dynamic_mode is defined %} + "profile_list" : "[BUFFER_PROFILE|ingress_lossless_profile],[BUFFER_PROFILE|ingress_lossy_profile]" +{% else %} + "profile_list" : "[BUFFER_PROFILE|ingress_lossless_zero_profile],[BUFFER_PROFILE|ingress_lossy_zero_profile]" +{% endif %} + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% endif %} + }, + "BUFFER_PORT_EGRESS_PROFILE_LIST": { +{% for port in port_names_active.split(',') %} + "{{ port }}": { + "profile_list" : "[BUFFER_PROFILE|egress_lossless_profile],[BUFFER_PROFILE|egress_lossy_profile]" + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% if port_names_inactive|length > 0 %} +, +{% for port in port_names_inactive.split(',') %} + "{{ port }}": { +{% if dynamic_mode is defined %} + "profile_list" : "[BUFFER_PROFILE|egress_lossless_profile],[BUFFER_PROFILE|egress_lossy_profile]" +{% else %} + "profile_list" : "[BUFFER_PROFILE|egress_lossless_zero_profile],[BUFFER_PROFILE|egress_lossy_zero_profile]" +{% endif %} + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% endif %} + } +{%- endmacro %} +``` + +generate_pg_profiles_with_inactive_ports: + +```json +{%- macro generate_pg_profiles(port_names_active, port_names_inactive) %} + "BUFFER_PG": { +{% for port in port_names_active.split(',') %} +{% if dynamic_mode is defined %} + "{{ port }}|3-4": { + "profile" : "NULL" + }, +{% endif %} + "{{ port }}|0": { + "profile" : "[BUFFER_PROFILE|ingress_lossy_profile]" + }{% if not loop.last 
%},{% endif %} + +{% endfor %} +{% if port_names_inactive|length > 0 %} +{%- for port in port_names_inactive.split(',') %} + {%- if loop.first -%},{%- endif -%} +{% if dynamic_mode is defined %} + "{{ port }}|3-4": { + "profile" : "NULL" + }, +{% endif %} + "{{ port }}|0": { +{% if dynamic_mode is defined %} + "profile" : "[BUFFER_PROFILE|ingress_lossy_profile]" +{% else %} + "profile" : "[BUFFER_PROFILE|ingress_lossy_pg_zero_profile]" +{% endif %} + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% endif %} + } +{%- endmacro %} +``` + +generate_queue_buffers_with_inactive_ports: + +```json +{%- macro generate_queue_buffers(port_names_active, port_names_inactive) %} + "BUFFER_QUEUE": { +{% for port in port_names_active.split(',') %} + "{{ port }}|3-4": { + "profile" : "[BUFFER_PROFILE|egress_lossless_profile]" + }, +{% endfor %} +{% for port in port_names_active.split(',') %} + "{{ port }}|0-2": { + "profile" : "[BUFFER_PROFILE|q_lossy_profile]" + }, +{% endfor %} +{% for port in port_names_active.split(',') %} + "{{ port }}|5-6": { + "profile" : "[BUFFER_PROFILE|q_lossy_profile]" + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% if port_names_inactive|length > 0 %} +, +{% if dynamic_mode is defined %} +{% for port in port_names_inactive.split(',') %} + "{{ port }}|3-4": { + "profile" : "[BUFFER_PROFILE|egress_lossless_profile]" + }, +{% endfor %} +{% for port in port_names_inactive.split(',') %} + "{{ port }}|0-2": { + "profile" : "[BUFFER_PROFILE|q_lossy_profile]" + }, +{% endfor %} +{% for port in port_names_inactive.split(',') %} + "{{ port }}|5-6": { + "profile" : "[BUFFER_PROFILE|q_lossy_profile]" + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% else %} +{% for port in port_names_inactive.split(',') %} + "{{ port }}|3-4": { + "profile" : "[BUFFER_PROFILE|egress_lossless_zero_profile]" + }, +{% endfor %} +{% for port in port_names_inactive.split(',') %} + "{{ port }}|0-2": { + "profile" : "[BUFFER_PROFILE|egress_lossy_zero_profile]" + }, 
+{% endfor %} +{% for port in port_names_inactive.split(',') %} + "{{ port }}|5-6": { + "profile" : "[BUFFER_PROFILE|egress_lossy_zero_profile]" + }{% if not loop.last %},{% endif %} + +{% endfor %} +{% endif %} +{% endif %} + } +{%- endmacro %} +``` + +Assume port `Ethernet0` is admin down, an example is: + +```json +{ + "BUFFER_PG" : { + "Ethernet0|0" : { + "profile": "[BUFFER_PROFILE|ingress_lossy_pg_zero_profile]" + } + }, + "BUFFER_QUEUE" : { + "Ethernet0|0-2" : { + "profile": "[BUFFER_PROFILE|egress_lossy_zero_profile]" + } + }, + "BUFFER_QUEUE" : { + "Ethernet0|3-4" : { + "profile": "[BUFFER_PROFILE|egress_lossless_zero_profile]" + } + }, + "BUFFER_QUEUE" : { + "Ethernet0|5-6" : { + "profile": "[BUFFER_PROFILE|egress_lossy_zero_profile]" + } + }, + "BUFFER_PORT_INGRESS_PROFILE_LIST" : { + "Ethernet0" : { + "profile_list" : "[BUFFER_PROFILE|ingress_lossless_zero_profile],[BUFFER_PROFILE|ingress_lossy_zero_profile]" + } + }, + "BUFFER_PORT_EGRESS_PROFILE_LIST" : { + "Ethernet0" : { + "profile_list" : "[BUFFER_PROFILE|egress_lossless_zero_profile],[BUFFER_PROFILE|egress_lossy_zero_profile]" + } + } +} +``` + +#### 7.1.2 The flow to deploy a switch #### + +In the flow, the steps in pink are needed to be implemented. + +- INACTIVE_PORT set needs to be generated. Currently, only ACTIVE_PORT set is generated. +- Zero profiles need to be generated if the INACTIVE_PORT set is not empty. +- Zero profiles need to be applied to inactive ports. + +All other steps exist. + +![Flow](reclaim-reserved-buffer-images/deploy.jpg "Figure: Deploy flow of static buffer model") + +#### 7.1.3 The flow to handle `speed`, `cable length` and `admin status` of the port #### + +In the flow, buffer manager will test port's admin status. It will remove lossless priority groups or apply zero profile on it and then skip the rest part if it's admin-down. This is the pink area in the flow chart, which needs to be implemented. + +All other steps exist. 
+ +![Flow](reclaim-reserved-buffer-images/normal.jpg "Figure: Normal flow of static buffer model") + +#### 7.1.4 The flow to handle `BUFFER_QUEUE` table add entry #### + +This is an existing flow. No code change is required. + +![Flow](reclaim-reserved-buffer-images/create-queue.jpg "Figure: create queue") + +#### 7.1.5 The flow to handle `BUFFER_PORT_INGRESS_PROFILE_LIST`, and `BUFFER_PORT_EGRESS_PROFILE_LIST` table add entry #### + +This is an existing flow. No code change is required. + +![Flow](reclaim-reserved-buffer-images/create-port-profile-list.jpg "Figure: create port profile list") + +### 7.2 Enable a port and configure buffer for it ### + +The following buffer profiles should be created before enabling a port and configuring buffer for it. By default, they are defined in `buffer template` and will be applied when the `minigraph` is reloaded. + +- `ingress_lossless_profile` +- `ingress_lossy_profile` +- `egress_lossless_profile` +- `egress_lossy_profile` +- `q_lossy_profile` + +The following items need to be configured to enable a port and configure buffer for it. By default, they are defined in `buffer template` or `minigraph` and will be applied when the the `minigraph` is reloaded. + +In case the user wants to re-enable a disabled port, he needs to configure the following items manually. + +- Set `admin status` to `up` for the port by executing command `config interface startup `. +- Add following entries in the corresponding buffer table for the port. + + Currently there is not any commands or other UI that user can use to add the following items. So the only way for a user to configure them is to compose a json file containing all the items and then to execute `sonic-cfggen -j --write-to-db`. We will give an example of each items. 
+ - PG 0 in `CONFIG_DB.BUFFER_PG` table as a lossy priority group with `BUFFER_PROFILE|ingress_lossy_profile` as the `profile` + + An example of PG items for port `Ethernet0`: + + ```json + { + "BUFFER_PG": { + "Ethernet0|0": { + "profile": "[BUFFER_PROFILE|ingress_lossy_profile]" + } + } + } + ``` + + - Queues `0-2`, `5-6` in `CONFIG_DB.BUFFER_QUEUE` as lossy queues with `BUFFER_PROFILE|q_lossy_profile` as the `profile` + - Queues `3-4` in `CONFIG_DB.BUFFER_QUEUE` as lossless queues with `BUFFER_PROFILE|egress_lossless_profile` as the `profile` + + An example of queue items for port `Ethernet0`: + + ```json + { + "BUFFER_QUEUE": { + "Ethernet0|0-2": { + "profile": "[BUFFER_PROFILE|q_lossy_profile]" + }, + "Ethernet0|3-4": { + "profile": "[BUFFER_PROFILE|egress_lossless_profile]" + }, + "Ethernet0|5-6": { + "profile": "[BUFFER_PROFILE|q_lossy_profile]" + } + } + } + ``` + + - An item in `CONFIG_DB.BUFFER_PORT_INGRESS_PROFILE_LIST` table with the following profiles in the `profile_list` + - `BUFFER_PROFILE|ingress_lossless_profile` for `ingress_lossless_pool` + - `BUFFER_PROFILE|ingress_lossy_profile` for `ingress_lossy_pool` if the pool exists + + An example of ingress profile list item for port `Ethernet0` for single ingress pool mode: + + ```json + { + "BUFFER_PORT_INGRESS_PROFILE_LIST": { + "Ethernet0": { + "profile_list": "[BUFFER_PROFILE|ingress_lossless_profile]" + } + } + } + ``` + + - An item in `CONFIG_DB.BUFFER_PORT_EGRESS_PROFILE_LIST` table with the following profiles in the `profile_list` + - `BUFFER_PROFILE|egress_lossless_profile` for `egress_lossless_pool` + - `BUFFER_PROFILE|egress_lossy_profile` for `egress_lossy_pool` + + An example of egress profile list item for port `Ethernet0`: + + ```json + { + "BUFFER_PORT_EGRESS_PROFILE_LIST": { + "Ethernet0": { + "profile_list": "[BUFFER_PROFILE|egress_lossless_profile],[BUFFER_PROFILE|egress_lossy_profile]" + } + } + } + ``` + +- Recalculate the sizes of shared buffer pool and shared headroom pool and 
configure them. + +After the flow has been successfully executed: + +- A lossless profile with name convention `pg_lossless___profile` will be created and inserted into `BUFFER_PROFILE` table. +- The priority group `3-4` will be created and inserted into `BUFFER_PG` table, referencing the buffer profile. +- Priority group `3` and `4` is enabled with corresponding headroom parameters (`headroom size`, `xon`, `xoff`) and alpha on the port. +- Priority group `0` is enabled with pipeline latency as `headroom size`. +- Reserved sizes and alpha of queue and port ingress/egress buffer pool are set according to the buffer profile referenced by the corresponding buffer tables. +- Sizes of shared buffer pool and shared headroom pool are set according to configuration. + +The flows are the same as those of deploy a switch. + +### 7.3 Disable a port and reclaim the buffer reserved for the port after a switch was deployed ### + +The user needs to: + +- Set the admin status of the port to `down` via executing command `config interface shutdown `. +- Remove the lossless PG of the port from `CONFIG_DB` and set the following entries to `zero profile`. By default, they are enforced by `buffer template`. + + In case the user enabled a port and then decides to disable it, the following entries are in the system and the user has to remove them manually. + + There is no way for a user to remove items from the `CONFIG_DB` on the fly. So the only way for a user to do it is to remove the items from `config_db.json` and then to execute `config reload`. Examples of items in each of the following tables are provided in the previous chapter. 
+ + Items need to be set to `zero profile`: + - entries of admin-down ports in table `BUFFER_QUEUE`, `BUFFER_PORT_INGRESS_PROFILE_LIST` and `BUFFER_PORT_EGRESS_PROFILE_LIST` + - lossy priority-groups of admin-down ports in table `BUFFER_PG` +- Calculate the sizes of shared buffer pool and shared headroom pool and then reconfigure them in `BUFFER_POOL` table in `CONFIG_DB`. + +After the flow has been successfully executed: + +- The entry of lossless priority-group `3-4` of the port is removed from `BUFFER_PG` table in `CONFIG_DB` +- Reserved size and headroom size of port's priority group `0`, `3` and `4` are zero. +- Reserved size of queues and port buffer pools of the port are zero. +- Sizes of shared buffer pool and shared headroom pool are updated accordingly. + +The flows of this are the same as those of deploy a switch. + +### 7.4 Summary: flows need to be implemented to support reclaiming reserved buffer of admin down ports ### + +According to the flows described in above sections, the following flows need to be implemented: + +1. Buffer template to generate zero buffer profiles and apply them if there are inactive ports. +2. Buffer manager to test port's admin status before creating lossless priority group for the port. +3. Buffer manager to remove port's lossless priority group once the port's admin status is changed to down. +4. Buffer orch to handle `BUFFER_PG` removing. + +## 8 Dynamic buffer model ## + +In dynamic buffer model + +- Normal profiles will be configured on all ports in `CONFIG_DB`. + + Reserved buffer of admin down ports is reclaimed by applying zero profiles on PGs, queues and ingress/egress profile lists on the port in `APPL_DB`. `CONFIG_DB` will not be touched during the procedure. +- Zero profiles will be: + - Provided as a json file on a per-vendor or per-platform basis. + - Loaded to buffer manager via CLI option `-z`. + - Applied to `APPL_DB` when all ports is admin up and one port is about to be shut down. 
+ + All zero profiles and zero pools will be applied into `APPL_DB` in the same order as they are defined in the json file. So vendor should guarantee the order satisfies the dependency, which means the zero pools should be defined ahead of zero profiles. + - Removed from `APPL_DB` when only one port is admin down and is about to be started up. + - For each buffer pool, there should be and only be one zero profiles referencing the buffer pool. +- When a port is shut down/started up, buffer manager will apply zero/normal profiles on all its buffer objects in `APPL_DB` respectively. +- For queues and priority groups, zero profiles will be applied on: + - Configured buffer items. For each queues and priority groups configured in `CONFIG_DB`, corresponding zero profiles will be applied on it. + - Supported-but-not-configured buffer items. In case any queue or priority group is supported by the port but not configured in `CONFIG_DB`, zero profile will be applied on it, which is achieved by generating an extra buffer item in `APPL_DB`. + + This is to guarantee the buffer reserved for any supported-but-not-configured queue or priority group will be reclaimed correctly. + + For example, + - A platform supports 16 queues. + - Queues `0-2`, and `5-6` are configured as lossy. + - Queues `3-4` are configured as lossless. + + The zero profiles will be applied on `0-2`, `3-4`, and `5-6`. As queues `7-15` are also supported but not configured, zero profiles will be applied on them by adding an `BUFFER_QUEUE_TABLE||7-15` item into `APPL_DB`. + + When the admin-down port is started up, such items will be removed from the system. In case removing queues is not supported on a platform, zero profiles will not be applied on `7-15`. After that, the reserved buffer size of these queues will be restored to the SDK default value. The reserved buffer can not be completely reclaimed if the SDK default value is not zero and removing items is not supported on the platform. + - Specific items. 
A set of IDs of queues or priority groups should be specified in the json file.
+  - The zero profiles will be applied on a specific set of IDs regardless of which queues/priority groups are configured.
+  - Buffer items on queues/priority groups that are supported on the port but not configured will be removed.
+
+  For example,
+  - Priority group `0` is specified to apply zero profile on.
+  - Priority group `0` is configured as lossy.
+  - Priority group `3-4` is configured as lossless.
+
+  The zero profile will be applied on `0` and the priority group `3-4` will be removed.
+- The number of PGs and queues supported on the port is pushed to `STATE_DB` by ports orchagent and learned by buffer manager.
+- In case the zero profiles are not provided, reserved buffer will be reclaimed by removing PGs and queues.
+
+  If removing is not allowed, reserved buffer will not be reclaimed.
+
+### 8.1 STATE_DB enhancement ###
+
+The maximum numbers of queues and priority groups of each port are pushed into the `BUFFER_MAX_PARAM_TABLE` table in `STATE_DB` when `ports orchagent` starts.
+
+Currently, there is only one field `max_headroom_size` in the table. The fields `max_priority_groups` and `max_queues` will be added to the table, representing the maximum number of priority groups and queues on the port respectively.
+
+```schema
+    key = BUFFER_MAX_PARAM_TABLE|<global or port name> ; when key is global, it should contain mmu_size.
+                                 ; when key is port name, it should contain max_headroom_size, max_priority_groups, and max_queues.
+    ; The following keys have been defined in the table currently.
+    mmu_size = 1*9DIGIT          ; Total available memory a buffer pool can occupy
+    max_headroom_size = 1*6DIGIT ; Optional. The maximum value of headroom size a physical port can have.
+                                 ; The accumulative headroom size of a port should be less than this threshold.
+                                 ; Not providing this field means no such limitation for the ASIC.
+ ; The following keys will be introduced in the table for reclaiming reserved buffer + max_priority_groups = 2*6DIGIT ; The maxinum number of priority groups supported on the port. + max_queues = 2*6DIGIT ; The maxinum number of queues supported on the port. + +``` + +### 8.2 Handle the buffer template of zero profiles ### + +#### 8.2.1 How the zero profiles are loaded #### + +The zero profiles are defined in buffer templates which is rendered to json when swss docker is created and loaded to `buffer manager` when the daemon is starting via CLI options. + +Currently, the CLI options to start the dynamic buffer manager includes + +```CLI +Usage: buffermgrd <-l pg_lookup.ini|-a asic_table.json [-p peripheral_table.json]> + -l pg_lookup.ini: PG profile look up table file (mandatory for static mode) + format: csv + values: 'speed, cable, size, xon, xoff, dynamic_threshold, xon_offset' + -a asic_table.json: ASIC-specific parameters definition (mandatory for dynamic mode) + -p peripheral_table.json: Peripheral (eg. gearbox) parameters definition (mandatory for dynamic mode) +``` + +We will extend CLI options by adding `-z` which represents the json file containing the zero profiles: + +```CLI + -z zero_profiles.json: Zero profiles definition for reclaiming unused buffers (optional for dynamic mode) +``` + +The zero profiles will always not be inserted into `CONFIG_DB`. + +They will not be inserted into `APPL_DB` until at least one port is shut down. After that, if all ports are admin up, the zero profiles will be removed from `APPL_DB`. + +#### 8.2.2 The json file for buffer template #### + +##### 8.2.2.2 The structure of the json file ##### + +The json file contains a list of zero pools (if necessary) and profiles, which will be handled by buffer manager. Any zero pool should be defined ahead of zero profiles to make sure all the buffer pools have been parsed when any buffer profiles is being parsed. This is to meet the dependency. 
+
+There is also an item containing control fields, including:
+
+- `pgs_to_apply_zero_profile`: In case zero profiles are not required to be applied on either all or configured priority groups, an ID map on which zero profiles should be applied can be specified on a per-platform basis in this field.
+- `queues_to_apply_zero_profile`: Similar to `pgs_to_apply_zero_profile` but for queues.
+- `ingress_zero_profile`: The ingress zero profile, in case the vendor needs to specify it explicitly. By default, the zero profile of each buffer pool is the profile in the list and referencing the pool.
+- `egress_zero_profile`: The egress zero profile. It is similar to the ingress one but on the egress side.
+- `support_removing_buffer_items`: By default, it is `yes`. In this case, the normal profiles will be removed from the admin down port before applying the zero profiles on all priority groups or queues.
+
+  In case removing is not supported by the vendor, this field should be specified as `no`. In this case, the zero profiles will be applied on all configured priority groups and queues.
+
+##### 8.2.2.2 An example of buffer template of zero profiles #####
+
+This is an example of a buffer template of zero profiles.
+
+In the example, the egress_zero_profile is not specified, so the buffer profiles `egress_lossy_zero_profile` and `egress_lossless_zero_profile` will be used.
+ +```json +[ + { + "BUFFER_POOL_TABLE:ingress_zero_pool": { + "mode": "static", + "type": "ingress", + "size": "0" + }, + "OP": "SET" + }, + { + "BUFFER_PROFILE_TABLE:ingress_lossy_pg_zero_profile" : { + "pool":"[BUFFER_POOL_TABLE:ingress_zero_pool]", + "size":"0", + "static_th":"0" + }, + "OP": "SET" + }, + { + "BUFFER_PROFILE_TABLE:ingress_lossy_zero_profile" : { + "pool":"[BUFFER_POOL_TABLE:ingress_lossy_pool]", + "size":"0", + "dynamic_th":"-8" + }, + "OP": "SET" + }, + { + "BUFFER_PROFILE_TABLE:ingress_lossless_zero_profile" : { + "pool":"[BUFFER_POOL_TABLE:ingress_lossless_pool]", + "size":"0", + "dynamic_th":"-8" + }, + "OP": "SET" + }, + { + "BUFFER_PROFILE_TABLE:egress_lossy_zero_profile" : { + "pool":"[BUFFER_POOL_TABLE:egress_lossy_pool]", + "size":"0", + "dynamic_th":"-8" + }, + "OP": "SET" + }, + { + "BUFFER_PROFILE_TABLE:egress_lossless_zero_profile" : { + "pool":"[BUFFER_POOL_TABLE:egress_lossless_pool]", + "size":"0", + "dynamic_th":"-8" + }, + "OP": "SET" + }, + { + "control_fields" : { + "pgs_to_apply_zero_profile":"0", + "ingress_zero_profile":"[BUFFER_PROFILE_TABLE:ingress_lossy_pg_zero_profile]" + }, + "OP": "SET" + } +] +``` + +### 8.3 Generate maximum number of queues and priority groups for each port ### + +The ports orchagent fetches the maximum numbers of queues and priority groups of a port via SAI interfaces when it is creating the port. After that, it will push the maximum numbers to `STATE_DB.BUFFER_MAX_PARAM_TABLE`. + +The buffer manager listens to the tables. Once it has heard the maximum numbers, it will generage the IDs of all queues and priority groups and store them into its internal data structure. The IDs of all queues and priority groups can be used to apply zero profiles when a port is admin down. In case the buffer manager hasn't heard the maximum numbers of queues or priority groups when a port is shut down, it will mark the port as `pending apply zero profiles` and retry later. 
+
+The flow of handling maximum numbers is:
+
+![Flow](reclaim-reserved-buffer-images/dynamic-port-init.jpg "Figure: Port init flow (focusing on maximum numbers handling)")
+
+### 8.4 Handle the port admin up/down ###
+
+Currently, when a port is shut down, the buffer reserved for admin-down ports is reclaimed by removing the objects from `APPL_DB` in dynamic buffer model:
+
+1. `buffer manager` removes the item related to the port from buffer tables in `APPL_DB`
+2. `buffer orch` notifies SAI to remove the object
+3. `SAI` sets reserved sizes to zero.
+
+![Flow](reclaim-reserved-buffer-images/dynamic-original.jpg "Figure: Reclaim reserved buffer in dynamic model - original flow")
+
+Now that we have a new way to do it, reserved buffer will be reclaimed by:
+
+- Removing lossless PGs.
+- Setting zero profile to corresponding buffer objects, including buffer PGs, buffer queues, and buffer profile lists.
+
+The new flow for admin down handling is:
+
+![Flow](reclaim-reserved-buffer-images/dynamic-new.jpg "Figure: Reclaim reserved buffer in dynamic model - new flow")
+
+### 8.5 Add a priority group or queue to an admin-down port ###
+
+Currently, only adding a priority group to an admin-down port is supported. It will be extended to support adding queues as well. This flow varies among different scenarios.
+
+#### 8.5.1 Add a priority group or queue to an admin-down port during initialization ####
+
+1. If `pgs_to_apply_zero_profile` or `queues_to_apply_zero_profile` is not empty:
+
+   - Taking priority group as an example, if the items to be configured are not the same as `pgs_to_apply_zero_profile`, remove the item from `APPL_DB`. This is to notify `orchagent` to add the item to the ready list.
+
+2. Otherwise:
+
+   - If there is a zero profile defined, apply the zero profile on the priority group in `APPL_DB`.
+   - Otherwise, remove the item from `APPL_DB`. This is to notify `orchagent` to add the item to the ready list.
+
+#### 8.5.2 Add a priority group or queue to an admin-down port after initialization ####
+
+Otherwise, if the system has been initialized, and `pgs_to_apply_zero_profile` or `queues_to_apply_zero_profile` is not empty, which means zero profiles have been applied on all configured queues or priority groups and supported-but-not-configured objects, extra steps should be taken.
+
+Let's take queues as an example to explain the principle. Suppose the queues to be added/set are `N`, currently the configured queues are `M` and the supported but not configured queues are `S`, the idea is:
+
+1. The union of `N`, `M` and `S` should be equal to the set of all supported queues and the intersection of any two sets among `N`, `M` and `S` should be empty.
+
+   E.g. currently, queues `3-4` are configured on port `Ethernet0`, queues `0-7` are supported and a user wants to configure queue `6` on top of them. In this case, the supported but not configured queues should be [`0-2`, `5-7`].
+
+2. Iterate the list of supported but not configured queues, and find the slice which equals or contains the queues the user wants to configure.
+
+   In the example, the slice is `5-7` because `5-7` contains `6`.
+
+3. Remove the slice from the supported but not configured queues list.
+
+   In the example, the slice `5-7` will be removed from the list.
+
+4. If the slice equals the queues to be configured, the procedure is finished. In this case, there is no need to apply the zero profile again given that it has been applied.
+
+5. Otherwise, meaning the slice contains the queues to be configured, split the slice into 2 or 3 children,
+
+   - One is exactly the queues to be configured, like `6` in the example
+   - The rest are
+     - The queues whose ID is less than the queues to be configured, like `5`
+     - And the queues whose ID is greater than the queues to be configured, like `7`
+   - Either of the above can be empty if the slice shares the upper or lower bound with the queues to be configured.
At least one bound differs, otherwise the slice would equal the queues to be configured.
+
+   Remove the slice from the `APPL_DB` as the zero profile has been applied on it and reapply zero profiles on all (both) children into `APPL_DB`.
+
+   In the above example, `BUFFER_QUEUE_TABLE:Ethernet0:5-7` will be removed from `APPL_DB`, and items `BUFFER_QUEUE_TABLE:Ethernet0:5`, `BUFFER_QUEUE_TABLE:Ethernet0:6` and `BUFFER_QUEUE_TABLE:Ethernet0:7` will be reapplied to `APPL_DB`.
+
+Even though from the ASIC's point of view there is no difference before step 5 and after it, this step must be done because if the system undergoes a warm reboot without step 5 done, it doesn't understand what items have been applied to `APPL_DB` and can introduce various items covering the same queues. In the above example, if there is no step 5, after warm reboot the buffer manager will apply `BUFFER_QUEUE_TABLE:Ethernet0:5`, `BUFFER_QUEUE_TABLE:Ethernet0:6` and `BUFFER_QUEUE_TABLE:Ethernet0:7` to `APPL_DB` with zero profiles. However, the item `BUFFER_QUEUE_TABLE:Ethernet0:5-7`, which was applied before the warm reboot, is still there.
+
+In all other scenarios, saving the new priority group or queue suffices.
+
+### 8.6 Remove a priority group or queue from an admin-down port ###
+
+If the `pgs_to_apply_zero_profile` or `queues_to_apply_zero_profile` is not empty, no further action is needed except for removing the items a user wants to remove from the internal data structure. This is because the reclaiming is done by applying zero profiles on designated priority groups or queues and the items to be removed have never been applied to `APPL_DB`.
+
+Otherwise, there are supported but not configured items on which the zero profile has been applied. Now that some items which were configured are about to be removed, they should be added to the supported but not configured items list and merged with an existing one in the list. There should always be one that is adjacent to the one to be removed and can be merged with it.
+
+For example, the configured items are `0` and `3-4` and the supported but not configured items list is [`1-2`, `5-7`]. If a user wants to remove item `0`, the item will be merged with `1-2` and item `0-2` will be generated.
+
+## 9 SAI API ##
+
+There is no new SAI API or attribute introduced in this design. The SAI APIs and attributes referenced in this design are listed below.
+
+### 9.1 Reclaim priority groups ###
+
+The SAI API `sai_buffer_api->set_ingress_priority_group_attribute` is used for reclaiming reserved buffer for priority groups. The arguments should be the following:
+
+```C
+    attr.id = SAI_INGRESS_PRIORITY_GROUP_ATTR_BUFFER_PROFILE;
+    attr.value.oid = OID of zero buffer profile on ingress;
+    sai_buffer_api->set_ingress_priority_group_attribute(pg_id, &attr); // pg_id is the SAI object ID of the priority group
+```
+
+After this SAI API is called, the reserved buffer of the priority group indicated by pg_id will be set to zero.
+
+### 9.2 Reclaim queues ###
+
+The SAI API `sai_queue_api->set_queue_attribute` is used for reclaiming reserved buffer for queues. The arguments should be the following:
+
+```C
+    attr.id = SAI_QUEUE_ATTR_BUFFER_PROFILE_ID;
+    attr.value.oid = OID of zero buffer profile on egress;
+    sai_queue_api->set_queue_attribute(queue_id, &attr); // queue_id is the SAI object ID of the queue
+```
+
+After this SAI API is called, the reserved buffer of the queue indicated by queue_id will be set to zero.
+
+### 9.3 Reclaim port reserved buffers ###
+
+The SAI API `sai_port_api->set_port_attribute` is used for reclaiming reserved buffer for port buffer pools.
The arguments should be the following: + +```C + // Reclaim reserved buffer on ingress side + attr.id = SAI_PORT_ATTR_QOS_INGRESS_BUFFER_PROFILE_LIST + attr.value.objlist.list = [OID of zero profile for each ingress pool] + attr.value.objlist.count = 2; + sai_port_api->set_port_attribute(port.m_port_id, &attr); + + // Reclaim reserved buffer on egress side + attr.id = SAI_PORT_ATTR_QOS_EGRESS_BUFFER_PROFILE_LIST + attr.value.objlist.list = [OID of zero profile for each egress pool] + attr.value.objlist.count = 2; + sai_port_api->set_port_attribute(port.m_port_id, &attr); +``` + +## 10 Configuration and management ## + +N/A + +### 10.1 CLI/YANG model Enhancements ### + +N/A + +### 10.2 Config DB Enhancements ### + +N/A + +### 10.3 Database migrator ### + +For any admin down port, if the port's buffer configuration aligns with the default configuration which is: + +- There is no lossless PG or a lossless PG according to the port's speed and cable length. +- There is no lossy PG, queues and buffer ingress/egress profile lists. + +The buffer configuration will be configured on the port: + +- For dynamic buffer model, default normal profiles will be configured on PGs, queues, and ingress/egress profile lists. +- For traditional buffer model, corresponding zero profiles will be configured on PGs, queues, and ingress/egress profile lists. + + The zero buffer pools and profiles will also be configured in this case. + +## 11 Warmboot and Fastboot Design Impact ## + +No impact on warm/fast boot. + +## 12 Restrictions/Limitations ## + +N/A + +## 13 Testing Requirements/Design ## + +### 13.1 Unit Test cases ### + +#### 13.1.1 Shutdown / startup a port #### + +Lossless PGs should be removed when a port is shutdown. + +1. Choose an admin-up port to test +2. Shutdown the port +3. Check whether the zero profiles have been applied on PGs and queues in the `APPL_DB` and `ASIC_DB` +4. Startup the port +5. 
Check whether the normal profiles have been applied on PGs and queues in the `APPL_DB` and `ASIC_DB` + +### 13.2 System Test cases ### + +#### 13.2.1 Shutdown / startup a port #### + +The zero profiles should be applied on PGs and queues when a port is shutdown. Sizes of shared headroom pool and shared buffer pool should be adjusted accordingly. + +1. Choose a port which is admin up to test +2. Shutdown the port +3. Check whether the zero profiles have been applied on PGs and queues in the `APPL_DB` and `ASIC_DB` +4. Adjust the sizes of shared headroom pool and shared buffer pool +5. Check whether the adjusted sizes are correct +6. Startup the port +7. Check whether the normal profiles have been applied on PGs and queues in the `APPL_DB` and `ASIC_DB` +8. Adjust the sizes of shared headroom pool and shared buffer pool +9. Check whether the adjusted sizes are correct + +## 14 Open/Action items - if any ## diff --git a/doc/qos/tunnel_dscp_remapping.md b/doc/qos/tunnel_dscp_remapping.md new file mode 100644 index 0000000000..c93e787c67 --- /dev/null +++ b/doc/qos/tunnel_dscp_remapping.md @@ -0,0 +1,324 @@ +# DSCP remapping for tunnel traffic + +## 1 Table of Content ### + +- [Revision](#11-revision) +- [Scope](#2-scope) +- [Definitions/Abbreviations](#3-definitionsabbreviations) +- [Overview](#4-overview) +- [Design](#5-design) + - [SWSS Schema](#51-swss-schema) + - [Define new table for mapping](#511-define-new-table-for-mapping) + - [Update existing TUNNEL table](#512-update-existing-tunnel-table) + - [Define new field for extra lossless queues](#513-define-new-field-for-extra-lossless-queues) + - [SAI attribute](#52-sai-attribute) + - [orchagent](#53-orchagent) +- [Test requirement](#6-test-requirement) +- [Open Questions](#7-open-questions) + +### 1.1 Revision ### +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:------------------:|-----------------------------------| +| 0.1 | | Bing Wang | Initial version | + + +## 2 Scope ## + +This document 
covers high level design of DSCP and TC remapping for tunnel traffic in SONiC. + +## 3 Definitions/Abbreviations ## + + +| Term | Meaning | +|:--------:|:---------------------------------------------:| +| PFC | Priority-based Flow Control | +| TC | Traffic class| +| DSCP| Differentiated Services Code Point | + +## 4 Overview + +In Dual-ToR scenario, PFC deadlock can happen if two servers are congested at same time and start sending PFC pause frames to both upper ToR and lower ToR. It is because the south bound traffic from T1 to standby ToR is bounced back to T1 via the same queue. Even when the block condition is resolved, the pause condition among T1 and both ToRs can stay forever and no traffic will go through. +

+Figure 1. Bounced back traffic deadlock

+ +To avoid this scenario, the bounced-back traffic from standby ToR to T1 will be remapped into another queue. For example, the traffic flow is as below if we are going to remap traffic in queue 3 to queue 2. + +

+Figure 2. Bounced back traffic flow +

+ +When congestion happens on server, the traffic flow of PFC pause frames is as below diagram. + +

+Figure 3. Bounced back traffic flow with PFC pause +

+The current QoS map architecture allows for port-based selection of each QoS map. However, we are not able to override the port-based QoS map for tunnel traffic. +This design proposes a method to remapping DSCP and TC for tunnel traffic. + + +## 5 Design ## + +### 5.1 SWSS Schema +#### 5.1.1 Define new table for mapping +Update [qos_config.j2](https://github.com/Azure/sonic-buildimage/blob/master/files/build_templates/qos_config.j2) to generate 4 tables for remapping. Currently, the remapping is required in `dual-tor` scenario. So the tables are rendered into `config_db` only when `DEVICE_METADATA['localhost']['subtype'] = 'DualToR`. + +Please be noted that below config is to remap traffic in queue 3 to queue 2, and traffic in queue 4 to queue 6. +Before remapping to queue 2 and 6, both queues are required to be cleared. Hence the current `DSCP_TO_TC_MAP|AZURE` in [qos_config.j2](https://github.com/Azure/sonic-buildimage/blob/master/files/build_templates/qos_config.j2) is required to be updated. +* Table for decap + + DSCP_TO_TC_MAP for mapping DSCP to TC + + ```json + "DSCP_TO_TC_MAP": { + "AZURE_TUNNEL": { + "0" : "1", + "1" : "1", + "2" : "1", + "3" : "3", + "4" : "4", + "5" : "1", // Original map "5" : "2" + "6" : "1", + "7" : "1", + "8" : "0", + "9" : "1", + "10": "1", + "11": "1", + "12": "1", + "13": "1", + "14": "1", + "15": "1", + "16": "1", + "17": "1", + "18": "1", + "19": "1", + "20": "1", + "21": "1", + "22": "1", + "23": "1", + "24": "1", + "25": "1", + "26": "1", + "27": "1", + "28": "1", + "29": "1", + "30": "1", + "31": "1", + "32": "1", + "33": "1", + "34": "1", + "35": "1", + "36": "1", + "37": "1", + "38": "1", + "39": "1", + "40": "1", + "41": "1", + "42": "1", + "43": "1", + "44": "1", + "45": "1", + "46": "5", + "47": "1", + "48": "7", // Original map "48" : "6". 
+ "49": "1", + "50": "1", + "51": "1", + "52": "1", + "53": "1", + "54": "1", + "55": "1", + "56": "1", + "57": "1", + "58": "1", + "59": "1", + "60": "1", + "61": "1", + "62": "1", + "63": "1" + } + ``` + + TC_TO_PRIORITY_GROUP_MAP for mappping TC to PG + + ```json + "TC_TO_PRIORITY_GROUP_MAP": { + "AZURE_TUNNEL": { + "0": "0", + "1": "0", + "2": "0", + "3": "2", // Original map "3" : "3" + "4": "6", // Original map "4" : "4" + "5": "0", + "6": "0", + "7": "0" // Original map "7" : "7" + } + ``` + +* Table for encap + + TC_TO_QUEUE_MAP for remapping queue + + ```json + "TC_TO_QUEUE_MAP": { + "AZURE_TUNNEL": { + "0": "0", + "1": "1", + "2": "1", // Original map "2" : "2" + "3": "2", // Original map "3" : "3" + "4": "6", // Original map "4" : "4" + "5": "5", + "6": "1", // Original map "6" : "6" + "7": "7" + } + ``` + + TC_TO_DSCP_MAP for rewriting DSCP. This map is newly added. + + ```json + "TC_TO_DSCP_MAP": { + "AZURE_TUNNEL": { + "0": "8", + "1": "0", + "2": "0", + "3": "2", + "4": "6", + "5": "46", + "6": "0", + "7": "48" + } + ``` + + To support the new table, a new YANG model `sonic-tc-dscp.yang` is required +#### 5.1.2 Update existing TUNNEL table +1. Change `dscp_mode` from `uniform` to `pipe` for TC remapping +2. 
Add TC remapping config if TC remapping is enabled + +```json + "TUNNEL": { + "MuxTunnel0": { + "dscp_mode": "pipe", + "dst_ip": "10.1.0.32", + "ecn_mode": "copy_from_outer", + "encap_ecn_mode": "standard", + "ttl_mode": "pipe", + "tunnel_type": "IPINIP", + "decap_dscp_to_tc_map": "[DSCP_TO_TC_MAP|AZURE_TUNNEL]", + "decap_tc_to_pg_map": "[TC_TO_PRIORITY_GROUP_MAP|AZURE_TUNNEL]", + "encap_tc_to_queue_map": "[TC_TO_QUEUE_MAP|AZURE_TUNNEL]", + "encap_tc_color_to_dscp_map": "[TC_TO_DSCP_MAP|AZURE_TUNNEL]" + } + } +``` + +#### 5.1.3 Define new field for extra lossless queues +Since we are going to have two extra lossless queues, while we are not going to enable watchdog on these two new queues, we need a new field to specify on which queue to enable PFC watchdog. + +* `pfc_enable` Specify on which queue to enable PFC +* `pfc_wd_sw_enable` Specify the queue(s) to enable PFC watchdog + +In current version, PFC watchdog will read `pfc_enable` to determine PFCWD is enabled on which queue(s). To maintain compatible with current logic, `db_migrator` script is required to be updated. + +```json +"PORT_QOS_MAP": { + "Ethernet0": { + "dscp_to_tc_map": "[DSCP_TO_TC_MAP|AZURE]", + "pfc_enable": "3,4,2,6", + "pfc_wd_sw_enable": "3,4", + "pfc_to_queue_map": "[MAP_PFC_PRIORITY_TO_QUEUE|AZURE]", + "tc_to_pg_map": "[TC_TO_PRIORITY_GROUP_MAP|AZURE]", + "tc_to_queue_map": "[TC_TO_QUEUE_MAP|AZURE]" + } +} +``` + +To support new field `pfc_wd_sw_enable`, [sonic-port-qos-map.yang](https://github.com/Azure/sonic-buildimage/blob/master/src/sonic-yang-models/yang-models/sonic-port-qos-map.yang) is required to be updated. + + +### 5.2 SAI attribute +TC remapping requires below SAI attributes change. 
+```cpp + /** + * @brief Enable TC AND COLOR -> DSCP MAP on tunnel at encapsulation (access-to-network) node to remark the DSCP in tunnel header + */ + SAI_TUNNEL_ATTR_ENCAP_QOS_TC_AND_COLOR_TO_DSCP_MAP, + + /** + * @brief Enable TC -> Queue MAP on tunnel encap + */ + SAI_TUNNEL_ATTR_ENCAP_QOS_TC_TO_QUEUE_MAP, + + /** + * @brief Enable DSCP -> TC MAP on tunnel at termination (Network-to-access) node. This map if configured overrides the port MAP + */ + SAI_TUNNEL_ATTR_DECAP_QOS_DSCP_TO_TC_MAP, + + /** + * @brief Enable TC -> Priority Group MAP. TC is derived from the tunnel MAP + */ + SAI_TUNNEL_ATTR_DECAP_QOS_TC_TO_PRIORITY_GROUP_MAP, +``` +For instance, when we get a traffic flow with DSCP = 3 on T1, the traffic and bounced back traffic is delivered and remapped as below: + +1. Traffic from `T1` to `Standby ToR` + - Traffic mapped to `TC3` and `PG3`by port level QoS mapping +2. Bounced back traffic from `Standby ToR` to `T1` + - Traffic arrived at `Standby ToR` in `TC3` and `PG3` as per port level QoS mapping + - Packet will be encapped and delivered back to `T1` by `MuxTunnel` + - The outer `DSCP` is rewritten to `2` as specified in `TC_TO_DSCP_MAP|AZURE_TUNNEL` by SAI attribute `SAI_TUNNEL_ATTR_ENCAP_QOS_TC_AND_COLOR_TO_DSCP_MAP`. + - Traffic is delivered in `Queue 2` as specified in `TC_TO_QUEUE_MAP|AZURE_TUNNEL` by SAI attribute `SAI_TUNNEL_ATTR_ENCAP_QOS_TC_TO_QUEUE_MAP` +3. Bounced back traffic from `T1` to `Active ToR` + - Bounced back traffic arrive at `T1` and `PG2` by port level QoS mapping + - Bounced back traffic will be routed to `Active ToR` +4. Traffic from `Active ToR` to `Server` + - Traffic arrived at `Active ToR` and will be decapped and delivered to server + - The outer `DSCP` is ignored as the `dscp_mode` for `MuxTunnel` is `PIPE`. The inner `DSCP3` is unchanged. 
+ - Traffic is remapped to `TC 3` as specified in `DSCP_TO_TC_MAP|AZURE_TUNNEL` by SAI attribute `SAI_TUNNEL_ATTR_DECAP_QOS_DSCP_TO_TC_MAP` + - Traffic is remapped to `PG 2` as specified in `TC_TO_PRIORITY_GROUP_MAP|AZURE_TUNNEL` by SAI attribute `SAI_TUNNEL_ATTR_DECAP_QOS_TC_TO_PRIORITY_GROUP_MAP` + - Traffic is in `Queue 3` as per port level QoS mapping + - Decapped traffic is delivered to target server + +The new SAI attributes are to be target at branch `202012` and `202205`. +### 5.3 orchagent + +Code change in orchagent + +1. Update `tunneldecaporch` to read and set new tunnel attributes when creating decap tunnel. + + | Attribute | Value | + |---|-----------| + | SAI_TUNNEL_ATTR_DECAP_QOS_DSCP_TO_TC_MAP | [DSCP_TO_TC_MAP\|AZURE_TUNNEL]| + | SAI_TUNNEL_ATTR_DECAP_QOS_TC_TO_PRIORITY_GROUP_MAP | [TC_TO_PRIORITY_GROUP_MAP\|AZURE_TUNNEL | + +2. Update `create_tunnel` defined in `muxorch.cpp` to read and set new tunnel attributes when creating tunnel. + + | Attribute | Value | + |---|-----------| + | SAI_TUNNEL_ATTR_ENCAP_QOS_TC_AND_COLOR_TO_DSCP_MAP | [TC_TO_DSCP_MAP\|AZURE_TUNNEL]| + | SAI_TUNNEL_ATTR_ENCAP_QOS_TC_TO_QUEUE_MAP | [TC_TO_QUEUE_MAP\|AZURE_TUNNEL] | + +3. Update code for handling decap terminator +Since both the `MuxTunnel` and regular IPinIP tunnel use Loopback address `10.1.0.32` as the `dst_ip`, they will share the same decap terminator. It may pose a conflict when we are going to apply extra attributes to `MuxTunnel`. +To avoid the potential conflict, we have to create two separate tunnel terminators, one for MuxTunnel and one for regular IPinIP tunnel +- For `MuxTunnel`, the type of terminator would be `P2P` as we have a specific `src_ip`, which is the peer's Loopback address. +- For regular IPinIP tunnel, the type of terminator is unchanged, which is `P2MP`. + +## 6 Test requirement +All changes are to be covered by system test. 
+* Encap at standby side + + * Test case 1 Verify DSCP re-writing + * test case 2 Verify traffic is egressed at expected queue + * Test case 3 Verify PFC frame generation at expected queue + +* Decap at active side + + * Test case 1 Verify packets egressed to server at expected queue + * Test case 2 Verify PFC pause frame block expected queue + * Test case 3 Verify PFC frame generation at expected queue + + +## 7 Open Questions + + + \ No newline at end of file diff --git a/doc/sfp-cmis/Interface-Link-bring-up-sequence.md b/doc/sfp-cmis/Interface-Link-bring-up-sequence.md new file mode 100644 index 0000000000..aef8958462 --- /dev/null +++ b/doc/sfp-cmis/Interface-Link-bring-up-sequence.md @@ -0,0 +1,195 @@ +# Feature Name +Deterministic Approach for Interface Link bring-up sequence + +# High Level Design Document +#### Rev 0.7 + +# Table of Contents + * [List of Tables](#list-of-tables) + * [Revision](#revision) + * [About This Manual](#about-this-manual) + * [Abbreviation](#abbreviation) + * [References](#references) + * [Problem Definition](#problem-definition) + * [Background](#background) + * [Objective](#objective) + * [Plan](#plan) + * [Pre-requisite](#pre-requisite) + * [Breakout handling](#breakout-handling) + * [Proposed Work-Flows](#proposed-work-flows) + +# List of Tables + * [Table 1: Definitions](#table-1-definitions) + * [Table 2: References](#table-2-references) + +# Revision +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:----------------------------------:|-----------------------------------| +| 0.1 | 08/16/2021 | Shyam Kumar | Initial version +| 0.2 | 12/13/2021 | Shyam Kumar, Jaganathan Anbalagan | Added uses-cases, workflows +| 0.3 | 01/19/2022 | Shyam Kumar, Jaganathan Anbalagan | Addressed review-comments +| 0.4 | 01/26/2022 | Shyam Kumar, Jaganathan Anbalagan | Addressed further review-comments +| 0.5 | 01/28/2022 | Shyam Kumar, Jaganathan Anbalagan | Addressed further review-comments +| 0.6 | 02/02/2022 | Shyam Kumar | 
Added feature-enablement workflow +| 0.7 | 02/02/2022 | Jaganathan Anbalagan | Added Breakout Handling + + +# About this Manual +This is a high-level design document describing the need to have determinstic approach for +Interface link bring-up sequence and workflows for use-cases around it + +# Abbreviation + +# Table 1: Definitions +| **Term** | **Definition** | +| -------------- | ------------------------------------------------ | +| pmon | Platform Monitoring Service | +| xcvr | Transceiver | +| xcvrd | Transceiver Daemon | +| CMIS | Common Management Interface Specification | +| gbsyncd | Gearbox (External PHY) docker container | +| DPInit | Data-Path Initialization | +| QSFP-DD | QSFP-Double Density (i.e. 400G) optical module | + +# References + +# Table 2 References + +| **Document** | **Location** | +|---------------------------------------------------------|---------------| +| CMIS v4 | [QSFP-DD-CMIS-rev4p0.pdf](http://www.qsfp-dd.com/wp-content/uploads/2019/05/QSFP-DD-CMIS-rev4p0.pdf) | +| CMIS v5 | [CMIS5p0.pdf](http://www.qsfp-dd.com/wp-content/uploads/2021/05/CMIS5p0.pdf) | + + +# Problem Definition + +1. Presently in SONiC, there is no synchronization between Datapath Init operation of CMIS complaint optical module and enabling ASIC (NPU/PHY) Tx which may cause link instability during administrative interface enable “config interface startup Ethernet” configuration and bootup scenarios. + + For CMIS-compliant active (optical) modules, the Host (NPU/PHY) needs to provide a valid high-speed Tx input signal at the required signaling rate and encoding type prior to causing a DPSM to exit from DPDeactivated state and to move to DP Init transient state. + + Fundamentally it means - have a deterministic approach to bring-up the interface. 
+ + Also, this problem is mentioned ‘as outside-the-scope’ of ‘CMIS Application Initialization’ high-level design document + **(https://github.com/ds952811/SONiC/blob/0e4516d7bf707a36127438c7f2fa9cc2b504298e/doc/sfp-cmis/cmis-init.md#outside-the-scope)** + +2. During administrative interface disable “config interface shutdown Ethernet”, only the ASIC(NPU) Tx is disabled and not the optical module Tx/laser. + This will lead to power wastage and un-necessary fan power consumption to keep the module temperature in operating range + +# Background + + Per the ‘CMIS spec’, ‘validation, diagnostics’ done by HW team' and 'agreement with vendors', + need to follow the following bring-up sequence to enable port/interface with CMIS compliant optical modules in LC/chassis: + + a) Enable port on NPU (bring-up port, serdes on the NPU ; enable signals) : syncd + b) Enable port on PHY (bring-up port, serdes on the PHY ; enable signals) : gbsyncd + - Wait for signal to stabilize on PHY + c) Enable optical module (data path initialization, turn laser on/ enable tx) : xcvrd + + In boards not having PHY, #b) not needed but #a) and #c) sequence to be followed. + + ## Clause from CMIS4.0 spec + + Excerpt from CMIS4.0 spec providing detailed reasoning for the above-mentioned bring-up sequence + + ![61f5b485-cf3b-4ca8-beac-9102b6feabfe](https://user-images.githubusercontent.com/69485234/147173702-f124fc9d-ef27-4816-b1a1-b4a44a5833a7.PNG) + + + ## Clause from CMIS5.0 spec + + Excerpt from CMIS5.0 spec providing detailed reasoning for the above-mentioned bring-up sequence + + ![96a35dc5-618f-418c-9593-5639a90f1b28](https://user-images.githubusercontent.com/69485234/147173164-5ad0123c-479a-4774-b3ee-12a81fdd7d7e.PNG) + + +# Objective + +Have a deterministic approach for Interface link bring-up sequence for all interfaces types i.e. below sequence to be followed: + 1. Initialize and enable NPU Tx and Rx path + 2. 
For system with 'External' PHY: Initialize and enable PHY Tx and Rx on both line and host sides; ensure host side link is up + 3. Then only perform optics data path initialization/activation/Tx enable (for CMIS compliant optical modules) and Tx enable (for SFF compliant optical modules) + +# Plan + +Plan is to follow this high-level work-flow sequence to accomplish the Objective: +- xcvrd to subscribe to a new field “host_tx_ready” in port table state-DB +- Orchagent will set the “host_tx_ready” to true/false based on the SET_ADMIN_STATE attribute return status from syncd/gbsyncd. (As part of SET_ADMIN_STATE attribute enable, the NPU Tx is enabled) +- xcvrd process the “host_tx_ready” value change event and do optics datapath init / de-init using CMIS API +- Per the discussion and agreement in sonic-chassis workgroup and OCP community, plan is to follow this proposal for all the known interfaces types- 400G/100G/40G/25G/10G. Reason being: + - CMIS compliant optical modules:- + All CMIS compliant optical modules will follow this approach as recommended in the CMIS spec. + - SFF compliant optical modules:- + - deterministic approach to bring the interface will eliminate any link stability issue which will be difficult to chase in the production network + e.g. If there is a PHY device in between, and this 'deterministic approach' is not followed, PHY may adapt to a bad signal or interface flaps may occur when the optics tx/rx enabled during PHY initialization. + - there is a possibility of interface link flaps with non-quiescent optical modules if this 'deterministic approach' is not followed + - It helps bring down the optical module laser when interface is administratively shutdown. Per the workflow here, this is achieved by xcvrd listening to host_tx_ready field from PORT_TABLE of STATE_DB. 
Turning the laser off would reduce the power consumption and avoid any lab hazard + - Additionally provides uniform workflow (from SONiC NOS) across all interface types with or without module presence. + - This synchronization will also benefit SFP+ optical modules as they are "plug N play" and may not have quiescent functionality. (xcvrd can use the optional 'soft tx disable' ctrl reg to disable the tx) + +# Pre-requisite + +As mentioned above in 'Background' and 'Plan' sections, need to follow specified bring-up sequence. +Work flows are designed considering SONiC NOS operating in sync mode. + +In case SONiC NOS operates in async mode, then expected behavior is - the return status of the set ADMIN_STATE attribute update in ASIC-DB (syncd/GBsyncd) will be treated to set the host_tx_ready in Orchagent. + +# Breakout Handling + - The new 'host_tx_ready' field of Port table in state-DB is created for every interface . + - Xcvrd processes the 'host_tx_ready' change event and is responsible to disable Tx/laser for all optical lanes or respective optical lane that belongs to the interface in case of breakout. + - Currently the logical mapping between the interface and optical lane is not present in xcvrd. Creating this logical mapping in xcvrd will address breakout interface handling. + +# Proposed Work-Flows + +Please refer to the flow/sequence diagrams which covers the following required use-cases + - Enabling this feature + - Transceiver initialization + - admin enable configurations + - admin disable configurations + - No transceiver present + +# Feature enablement + This feature (optics Interface Link bring-up sequence) would be enabled on per platform basis. + There could be cases where vendor(s)/platform(s) may take time to shift from existing codebase to the model (work-flows) described in this document. 
+ In order to avoid any breakage and ensure gradual migration of different platforms/vendors to this model, there would be new field (flag) in xcvrd to enable/disable this feature. + When xcvrd spawns on LC/board, it would invoke platform plugin to check with the platform (hwsku) whether this feature is yet supported on underlying platform (board/LC) or not + + Workflow : + ![Enabling 'Interface link bring-up sequence' feature(3)](https://user-images.githubusercontent.com/69485234/152266723-050377ce-d4de-4c67-a405-5acc66474d46.png) + + +# Transceiver Initialization + (at platform bootstrap layer) + +![LC boot-up sequence - optics INIT (platform bootstrap)](https://user-images.githubusercontent.com/69485234/152261613-e20dcda9-2adc-42aa-a1f1-4b8a47dd32af.png) + +# Applying 'interface admin startup' configuration + +![LC boot-up sequence - 'admin enable' Config gets applied](https://user-images.githubusercontent.com/69485234/147166867-56f3e82d-1b1c-4b7a-a867-5470ee6050e7.png) + + +# Applying 'interface admin shutdown' configuration + +![LC boot-up sequence - 'admin disable' Config gets applied](https://user-images.githubusercontent.com/69485234/147166884-92c9af48-2d64-4e67-8933-f80531d821b4.png) + +# No transceiver present +if transceiver is not present: + - All the workflows mentioned above will remain the same ( or get exercised) till host_tx_ready field update + - xcvrd will not perform any action on receiving host_tx_ready field update + + +# Out of Scope +Following items are not in the scope of this document. They would be taken up separately +1. xcvrd restart + - If the xcvrd goes for restart, then all the DB events will be replayed. + Here the Datapath init/activate for CMIS compliant optical modules, tx-disable register set (for SFF compliant optical modules), will be a no-op if the optics is already in that state +2. 
syncd/gbsyncd/swss docker container restart + - Cleanup scenario - Check if the host_tx_ready field in STATE-DB need to be updated to “False” for any use-case, either in going down or coming up path + - Discuss further on the possible use-cases +3. CMIS API feature is not part of this design and the APIs will be used in this design. For CMIS HLD, Please refer to: + https://github.com/Azure/SONiC/blob/9d480087243fd1158e785e3c2f4d35b73c6d1317/doc/sfp-cmis/cmis-init.md +4. Error handling of SAI attributes + a) At present, If there is a set attribute failure, orch agent will exit. + Refer the error handling API : https://github.com/Azure/sonic-swss/blob/master/orchagent/orch.cpp#L885 + b) Error handling for SET_ADMIN_STATUS attribute will be added in future. + c) A propabale way to handle the failure is to set a error handling attribute to respective container syncd/GBsyncd with attribute that is failed. + The platform layer knows the error better and it will try to recover. + diff --git a/doc/sfp-cmis/cmis-init.md b/doc/sfp-cmis/cmis-init.md new file mode 100644 index 0000000000..810d55969e --- /dev/null +++ b/doc/sfp-cmis/cmis-init.md @@ -0,0 +1,489 @@ +# Feature Name +CMIS Application Initialization + +# High Level Design Document +#### Rev 0.1 (Draft) + +# Table of Contents + * [List of Tables](#list-of-tables) + * [Revision](#revision) + * [About This Manual](#about-this-manual) + * [Abbreviation](#abbreviation) + * [References](#references) + * [Requirement](#requirement) + * [Overview](#overview) + * [Scope](#scope) + * [Outside the Scope](#outside-the-scope) + * [Functional Requirements](#functional-requirements) + * [Warm Boot Requirements](#warm-boot-requirements) + * [Functional Description](#functional-description) + * [Design](#design) + * [sonic-platform-daemons/sonic-xcvrd](#sonic-platform-daemonssonic-xcvrd) + * [Conditions for Datapath init](#conditions-for-datapath-init) + * 
[sonic-platform-common/sonic_platform_base/sfp_base.py](#sonic-platform-commonsonic_platform_basesfp_base.py) + * [sonic-platform-common/sonic_platform_base/sonic_xcvr/api/public/cmis.py](#sonic-platform-commonsonic_platform_basesonic_xcvrapipubliccmis.py) + * [sonic-platform-common/sonic_platform_base/sonic_xcvr/fields/consts.py](#sonic-platform-commonsonic_platform_basesonic_xcvrfieldsconsts.py) + * [sonic-platform-common/sonic_platform_base/sonic_xcvr/fields/public/cmis.py](#sonic-platform-commonsonic_platform_basesonic_xcvrfieldspubliccmis.py) + * [sonic-platform-common/sonic_platform_base/sonic_xcvr/mem_maps/public/cmis.py](#sonic-platform-commonsonic_platform_basesonic_xcvrmem_mapspubliccmis.py) + * [sonic-platform-common/sonic_platform_base/sonic_xcvr/sfp_optoe_base.py](#sonic-platform-commonsonic_platform_basesonic_xcvrsfp_optoe_base.py) + * [sonic-utilities/sfputil](#sonic-utilitiessfputil) + * [CLI commands](#cli-commands) + * [CLI Show Commands](#cli-show-commands) + * [CLI Debug Commands](#cli-debug-commands) + +# List of Tables + * [Table 1: Definitions](#table-1-definitions) + * [Table 2: References](#table-2-references) + * [Table 3: Port Table Name Mappings](#table-3-port-table-name-mappings) + * [Table 4: CMIS State Table](#table-4-cmis-state-table) + +# Revision +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:-------------------:|-------------------------------------------| +| 0.1 | 11/16/2021 | Dante (Kuo-Jung) Su | Initial version | + +# About this Manual +This document provides general information about the CMIS application initialization +support for SONiC. 
+ +# Abbreviation + +# Table 1: Definitions +| **Term** | **Definition** | +| -------------- | ------------------------------------------------ | +| pmon | Platform Monitoring Service | +| xcvr | Transceiver | +| xcvrd | Transceiver Daemon | +| CMIS | Common Management Interface Specification | + +# References + +# Table 2 References + +| **Document** | **Location** | +|---------------------------------------------------------|---------------| +| CMIS v5 | [CMIS5p0.pdf](http://www.qsfp-dd.com/wp-content/uploads/2021/05/CMIS5p0.pdf) | + +# Requirement + +## Overview + +This document describes functional behavior of the CMIS application initialization +support in SONiC. + +The Common Management Interface Specification (CMIS) provides a variety of features +and support for different transceiver form factors. A CMIS transceiver may support +multiple application, and the application initialization sequence is now mandatory +upon port mode changes. Otherwise the link will be down if the host port mode +does not match the selected application on the CMIS transceiver. + +The feature is built on top of SONiC **sfp-refactor** framework to provide a +platform-independent solution, and the individual platforms could easily enable +this feaure by having its **Sfp** object inherited from **SfpOptoeBase**. + +**Example:** +``` +from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase + +class Sfp(SfpOptoeBase): + + def __init__(self, sfp_index): + SfpOptoeBase.__init__(self) + self.index = sfp_index + + def get_port_type(self): + return self.SFP_PORT_TYPE_QSFPDD + + def get_eeprom_path(self): + # platform-specific per-port I2C bus + bus_id = 32 + self.index + return "/sys/bus/i2c/devices/{}-0050/eeprom".format(bus_id) +``` + +## Scope + +The scope of this feature are as follows: + +- **CMIS software initialization for the default application.** + - All the lanes of the CMIS module will be reconfigured to use the default application. 
+ - Only staged control set 0 will be supported + - No speed negotiation. + - No custom signal integrity settings + - Implement the procedures defined in Appendix D.1.3 and D.2.2 of [CMIS v5](http://www.qsfp-dd.com/wp-content/uploads/2021/05/CMIS5p0.pdf) +- **sonic-platform-common**: Update **sonic-xcvr** for CMIS application advertising and initialization +- **sonic-platform-daemons**: Update **sonic-xcvrd** for state-based CMIS application initialization +to support multiple CMIS transceivers in one single thread +- **sonic-utilities**: Update the **sfputil** and **sfpshow** for CMIS application advertisement + +## Outside the Scope + +The following items are outside the scope of this document: + +- The synchronization between syncd and xcvrd + The datapath initialization should happen only when the Tx signals from the ASIC/MAC towards + the optics is valid and good for the selected application (i.e. 6.3.3.5 DPInit State of [CMIS v5](http://www.qsfp-dd.com/wp-content/uploads/2021/05/CMIS5p0.pdf)) + Currently, **pmon#xcvrd** do not have any means to know this at runtime whether the port breakout + configuration is done or ASIC/MAC is ready in the desired mode. +- The non-default application is not supported + Only application 1 (i.e. default application) is supported, the port configurations that + require non-default application code may experience link failures on CMIS v4/v5 optics. + +## Functional Requirements + +1. Ability to parse the advertised applications from the transceivers +2. Ability to post the advertised applications to the STATE_DB +3. Ability to support multiple CMIS transceivers in one single thread. +4. Ability to detect the errors of the application initialization on the transceivers + +**Note:** +The duration of the CMIS application initialization greatly differs from transceivers +to transceivers, while some take 3 seconds for activating the 4x100G mode, some take 15 seconds. 
+ +## Warm Boot Requirements + +Functionality should continue to work across warm boot. +- The CMIS application initialization should not be performed during WarmBoot. +- The CMIS application initialization should be skipped if no application code updates. + +# Functional Description + +- The **pmon#xcvrd** should detect the module type of the attched CMIS transceiver and +post its module information onto the **STATE_DB** +- When a CMIS transceiver is attached, the **show interfaces transceiver eeprom** +should display the advertised applications of the transceiver. +- When a CMIS transceiver is attached, the **sfputil** should be capable of reporting +the errors of the CMIS application initialization. +- When a CMIS transceiver is detected, the **pmon#xcvrd** should automatically activate +the appropriate application as per the current port mode. +- Only the default application mode will be supported for now, we'll deal with the dynamic +application update later when the synchronization mechanism between **syncd** and **pmon#xcvrd** +is in place and ready. + +# Design + +## sonic-platform-daemons/sonic-xcvrd + +The transceiver daemon will be enhanced as below: + +- The **application_advertisement** in the **TRANSCEIVER_INFO** table is now updated +to the json format: + +**Original:** +``` + "TRANSCEIVER_INFO|Ethernet0": { + "type": "hash", + "value": { + "application_advertisement": "400GAUI-8 C2M (Annex 120E) - 400GBASE-DR4 (Cl 124)\n 100GAUI-2 C2M (Annex 135G) - 100G-FR/100GBASE-FR1 (Cl 140)", + ...... omitted ...... 
+ } + }, +``` + +**Modified:** +``` + "TRANSCEIVER_INFO|Ethernet0": { + "type": "hash", + "value": { + "application_advertisement": "{1: {'host_electrical_interface_id': '400GAUI-8 C2M (Annex 120E)', 'module_media_interface_id': '400GBASE-DR4 (Cl 124)', 'host_lane_count': 8, 'media_lane_count': 4, 'host_lane_assignment_options': 1, 'media_lane_assignment_options': None}, 2: {'host_electrical_interface_id': '100GAUI-2 C2M (Annex 135G)', 'module_media_interface_id': '100G-FR/100GBASE-FR1 (Cl 140)', 'host_lane_count': 2, 'media_lane_count': 1, 'host_lane_assignment_options': 85, 'media_lane_assignment_options': None}}", + ...... omitted ...... + } + }, + ...... omitted ...... +``` + +- **xcvrd_utilities/port_mapping.py**: Add support for **APPL_DB** and **STATE_DB** + - **subscribe_port_update_event(db_list=['APPL_DB', 'STATE_DB'])** subscribes to both **APPL_DB** and **STATE_DB** by default. + + - **handle_port_update_event(sel, asic_context, stop_event, logger, handler)** is the wrapper for port update event handler, + and the handler routines will be invoked upon all the database SET/DEL commands as follows + - **PortChangeEvent.PORT_SET** + The event for database **SET** operations. + - **PortChangeEvent.PORT_DEL** + The event for database **DEL** operations. 
+ + - The port table names associated with the database are as follows + + ### Table 3 Port Table Name Mappings + | Database | Table Name | + |:--------- |:---------------------- | + | APPL_DB | PORT_TABLE | + | CONFIG_DB | PORT | + | STATE_DB | TRANSCEIVER_INFO | + + - The port update events from **APPL_DB** should be interpreted as the port config update + notifications from the **swss#orchagent** + - The port update events from **STATE_DB** should be interpreted as the transceiver insertion + and removal notifications from the **xcvrd#SfpStateUpdateTask** + - Upon **WARM-REBOOT** and **pmon** restart, these events will always be replayed to the + **CmisManagerTask** and the initialization sequence will be skipped if the desired application + is already in place and ready. (For more details, please refer to the summary of **xcvrd.py** below) + +- **xcvrd.py**: Add **CmisManagerTask** for the state-based CMIS application initialization +to support multiple CMIS transceivers in one single thread. + - The CMIS states are listed below + + ### Table 4 CMIS State Table + | State | Description | Next State | + |:--------- |:----------------|:----------------------| + | UNKNOWN | Unknown state | N/A | + | INSERTED | Module is newly inserted | DP_DEINIT | + | DP_DEINIT | DatPath is de-initialized with tx-power turned off | AP_CONFIGURED | + | AP_CONFIGURED | Application configured | DP_INIT | + | DP_INIT | DatPath is initialized with tx-power turned off | DP_TXON | + | DP_TXON | DatPath is initialized with tx-power turned on | READY | + | READY | Transceiver is ready in the new application mode | N/A | + | REMOVED | Module is removed | N/A | + | FAILED | Initialization failed | N/A | + + - At each loop iteration of CmisManagerTask.task_worker(), the state will only be advanced by 1 and only 1 state upon success. + - Prior to advancing the state, CmisManagerTask should always double-check the hardware module and datapath states. 
+ - Prior to handling the CMIS state transitions, the following checkers are always performed + - **Check for the transceiver presence** via sfp.get_presence(), + abort the initialization sequence if it's no loner present + - **Validate the transceiver module type** via sfp.get_transceiver_info()['type_abbrv_name'], + abort the initialization sequence if it's not a QSFP-DD + - **Validate the transceiver memory type** via sfp.get_transceiver_info()['memory_type'], + abort the initialization sequence if the **Paged** memory is not available. + - From **INSERTED** to **DP_DEINIT** + Skip the initialization sequence by having the state transitioned to **READY** + if no application code updates and DataPath state is 4 (i.e. DataPathActivated) and + config state is 1 (i.e. ConfigSuccess), otherwise invoke sfp.set_cmis_application_stop() + and have the state transitioned to **DP_DEINIT** + - From **DP_DEINIT** to **AP_CONFIGURED** + Stay at **DP_DEINIT** state if module state != **ModuleReady(3)**, otherwise invoke + sfp.set_cmis_application_apsel() and have the state transitioned to **AP_CONFIGURED** + - From **AP_CONFIGURED** to **DP_INIT** + Stay at **AP_CONFIGURED** state if config state != **ConfigSuccess(1)**, otherwise + invoke sfp.set_cmis_application_start() and have the state transitioned to **DP_INIT** + - From **DP_INIT** to **DP_TXON** + Stay at **DP_INIT** state if DataPath state != **DataPathInitialized(7)**, otherwise + invoke sfp.set_cmis_application_txon() and have the state transitioned to **DP_TXON** + - From **DP_TXON** to **READY** + Stay at **DP_TXON** state if DataPath state != **DataPathActivated(4)**, otherwise have the + state transitioned to **READY** + - The CMIS state transition diagram + ![](images/001.png) + + ### Conditions for Datapath init + + The datapath should be re-initialized in the following scenarios + + - Transceiver insertion detection + When a CMIS transceiver insertion is detected, it will be placed in **INSERTED** state, + 
the datapath re-initialization should be skipped and directly transitioned to **READY** state + if all the following checkers are positive + - The operational application code matches the desired application mode derived from + the current port configurations in the CONFIG_DB/APPL_DB + - The datapath state is **DataPathActivated(4)** + - The configuration error is **ConfigSuccess(1)** + - Port mode changes that require a CMIS application code update (e.g Dynamic Port Breakout, + outside the scope of this document) + - Port speed changes that require a CMIS application code update + +## sonic-platform-common/sonic_platform_base/sfp_base.py + +Add the following macro constants to differentiate the port/cage type from the media type fetched from the transceiver +- SFP_PORT_TYPE_UNSPECIFIED +- SFP_PORT_TYPE_SFP +- SFP_PORT_TYPE_QSFP +- SFP_PORT_TYPE_QSFPDD + +Add the following stub routines +- get_port_type(self) + Retrieves the port/cage type of this SFP + +## sonic-platform-common/sonic_platform_base/sonic_xcvr/api/public/cmis.py + +- Add support for low-power mode controls +- Add support for reporting CMIS application advertisement +- Add support for reporting CMIS application initialization failures +- Add support for CMIS application initialization + For more details, please refer to **Appendix D.1.3 and D.2.2** of +[CMIS v5](http://www.qsfp-dd.com/wp-content/uploads/2021/05/CMIS5p0.pdf) + +## sonic-platform-common/sonic_platform_base/sonic_xcvr/fields/consts.py + +- Add register definitions for CMIS application initialization + +## sonic-platform-common/sonic_platform_base/sonic_xcvr/fields/public/cmis.py + +- Add support for parsing CMIS application advertisement + +## sonic-platform-common/sonic_platform_base/sonic_xcvr/mem_maps/public/cmis.py + +- Add support for CMIS registers those are necessary for application initialization + +## sonic-platform-common/sonic_platform_base/sonic_xcvr/sfp_optoe_base.py + +- Add the following routines for CMIS application 
initialization + - **has_cmis_application_update(self, host_speed, host_lanes)** + Check for CMIS updates and the new application if an update is necessary + - **set_cmis_application_stop(self, host_lanes)** + A non-blocking routine to deinitialize DataPath, put the CMIS module into low-power mode + and finally reactivate high-power mode, the users are supposed to check the module state + prior to configuring application code by **set_cmis_application_apsel()** + - **set_cmis_application_apsel(self, host_lanes, appl_code=1)** + A non-blocking routine to configure the application code, the users are supposed to check + the config error prior to initialize the DataPath by **set_cmis_application_start()** + - **set_cmis_application_start(self, host_lanes)** + A non-blocking routine to initialize the DataPath, the users are supposed to check + the DataPath state prior to turn on the Tx power by **set_cmis_application_txon()** + - **set_cmis_application_txon(self, host_lanes)** + A non-blocking routine to turn on the Tx power, the users could later check the DataPath state + to see if the module is correctly brought up. + - **get_cmis_state(self)** + Retrieve the CMIS module state, config error and DataPath state. + - **get_error_description(self)** + Report the CMIS application initialization failures + - **get_lpmode(self)** + Check if the CMIS module is placed in low-power mode + - **set_lpmode(self, lpmode)** + Enable/Disable the low-power mode + - **get_module_state(self)** + Retrieve the CMIS module state + +## sonic-utilities/sfputil + +- Add support to show the CMIS application advertisement + +## CLI commands + +### CLI Show Commands + +#### show interfaces transceiver eeprom + +This utility is now updated as below. 
+ +**Original** +``` +admin@sonic:~$ show interfaces transceiver eeprom Ethernet0 +Ethernet0: SFP EEPROM detected + Application Advertisement: {1: {'host_electrical_interface_id': '400GAUI-8 C2M (Annex 120E)', 'module_media_interface_id': '400GBASE-DR4 (Cl 124)', 'host_lane_count': 8, 'media_lane_count': 4, 'host_lane_assignment_options': 1, 'media_lane_assignment_options': None}, 2: {'host_electrical_interface_id': '100GAUI-2 C2M (Annex 135G)', 'module_media_interface_id': '100G-FR/100GBASE-FR1 (Cl 140)', 'host_lane_count': 2, 'media_lane_count': 1, 'host_lane_assignment_options': 85, 'media_lane_assignment_options': None}} + Connector: SN optical connector + Encoding: N/A + Extended Identifier: Power Class 6 (12.0W Max) + Extended RateSelect Compliance: N/A + Identifier: QSFP-DD Double Density 8X Pluggable Transceiver + Length cable Assembly(m): 0.0 + Nominal Bit Rate(100Mbs): 0 + Specification compliance: sm_media_interface + Vendor Date Code(YYYY-MM-DD Lot): 2020-10-07 + Vendor Name: AVAGO + Vendor OUI: 00-17-6a + Vendor PN: AFCT-93DRPHZ-AZ2 + Vendor Rev: 01 + Vendor SN: FD2038FG0FY +``` + +**Modified** +``` +admin@sonic:~$ show interfaces transceiver eeprom Ethernet0 +Ethernet0: SFP EEPROM detected + Application Advertisement: + 1: 400GAUI-8 C2M (Annex 120E) | 400GBASE-DR4 (Cl 124) + 2: 100GAUI-2 C2M (Annex 135G) | 100G-FR/100GBASE-FR1 (Cl 140) + Connector: SN optical connector + Encoding: N/A + Extended Identifier: Power Class 6 (12.0W Max) + Extended RateSelect Compliance: N/A + Identifier: QSFP-DD Double Density 8X Pluggable Transceiver + Length cable Assembly(m): 0.0 + Nominal Bit Rate(100Mbs): 0 + Specification compliance: sm_media_interface + Vendor Date Code(YYYY-MM-DD Lot): 2020-10-07 + Vendor Name: AVAGO + Vendor OUI: 00-17-6a + Vendor PN: AFCT-93DRPHZ-AZ2 + Vendor Rev: 01 + Vendor SN: FD2038FG0FY +``` + +#### sudo sfputil show eeprom + +This utility is now updated as below. 
+ +**Original** +``` +admin@sonic:~$ sudo sfputil show eeprom -p Ethernet0 +Ethernet0: SFP EEPROM detected + Application Advertisement: {1: {'host_electrical_interface_id': '400GAUI-8 C2M (Annex 120E)', 'module_media_interface_id': '400GBASE-DR4 (Cl 124)', 'host_lane_count': 8, 'media_lane_count': 4, 'host_lane_assignment_options': 1, 'media_lane_assignment_options': None}, 2: {'host_electrical_interface_id': '100GAUI-2 C2M (Annex 135G)', 'module_media_interface_id': '100G-FR/100GBASE-FR1 (Cl 140)', 'host_lane_count': 2, 'media_lane_count': 1, 'host_lane_assignment_options': 85, 'media_lane_assignment_options': None}} + Connector: SN optical connector + Encoding: N/A + Extended Identifier: Power Class 6 (12.0W Max) + Extended RateSelect Compliance: N/A + Identifier: QSFP-DD Double Density 8X Pluggable Transceiver + Length cable Assembly(m): 0.0 + Nominal Bit Rate(100Mbs): 0 + Specification compliance: sm_media_interface + Vendor Date Code(YYYY-MM-DD Lot): 2020-10-07 + Vendor Name: AVAGO + Vendor OUI: 00-17-6a + Vendor PN: AFCT-93DRPHZ-AZ2 + Vendor Rev: 01 + Vendor SN: FD2038FG0FY +``` + +**Modified** +``` +admin@sonic:~$ sudo sfputil show eeprom -p Ethernet0 +Ethernet0: SFP EEPROM detected + Application Advertisement: + 1: 400GAUI-8 C2M (Annex 120E) | 400GBASE-DR4 (Cl 124) + 2: 100GAUI-2 C2M (Annex 135G) | 100G-FR/100GBASE-FR1 (Cl 140) + Connector: SN optical connector + Encoding: N/A + Extended Identifier: Power Class 6 (12.0W Max) + Extended RateSelect Compliance: N/A + Identifier: QSFP-DD Double Density 8X Pluggable Transceiver + Length cable Assembly(m): 0.0 + Nominal Bit Rate(100Mbs): 0 + Specification compliance: sm_media_interface + Vendor Date Code(YYYY-MM-DD Lot): 2020-10-07 + Vendor Name: AVAGO + Vendor OUI: 00-17-6a + Vendor PN: AFCT-93DRPHZ-AZ2 + Vendor Rev: 01 + Vendor SN: FD2038FG0FY +``` + +### CLI Debug Commands + +#### sfputil show error-status + +This utility is also enhanced to detect CMIS failures + +Example: +``` +admin@sonic:~$ sudo sfputil 
show error-status +Port Error Status +----------- -------------- +Ethernet0 OK +Ethernet8 ConfigRejected +Ethernet16 DataPathDeinit +Ethernet24 Unplugged +Ethernet32 Unplugged +``` + +#### show logging xcvrd + +Please use **show logging xcvrd | grep CMIS** for the debug logs + +Example: +``` +admin@sonic:~$ show logging xcvrd | grep CMIS +Nov 19 07:28:53.878758 sonic NOTICE pmon#xcvrd[34]: CMIS: Starting... +Nov 19 07:28:54.571593 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet32: 400G, 8-lanes, state=INSERTED +Nov 19 07:28:55.359298 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet0: 400G, 8-lanes, state=INSERTED +Nov 19 07:28:57.203301 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet32: 400G, 8-lanes, state=DP_DEINIT +Nov 19 07:28:57.874821 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet0: 400G, 8-lanes, state=DP_DEINIT +Nov 19 07:28:59.550191 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet32: 400G, 8-lanes, state=DP_DEINIT +Nov 19 07:29:00.287885 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet0: 400G, 8-lanes, state=DP_DEINIT +Nov 19 07:29:02.059347 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet32: 400G, 8-lanes, state=AP_CONFIGURED +Nov 19 07:29:02.781345 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet0: 400G, 8-lanes, state=AP_CONFIGURED +Nov 19 07:29:04.504715 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet32: 400G, 8-lanes, state=DP_INIT +Nov 19 07:29:05.210270 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet0: 400G, 8-lanes, state=DP_INIT +Nov 19 07:29:06.877637 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet32: 400G, 8-lanes, state=DP_TXON +Nov 19 07:29:06.902918 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet32: 400G, 8-lanes, state=READY +Nov 19 07:29:07.545593 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet0: 400G, 8-lanes, state=DP_TXON +Nov 19 07:29:07.581243 sonic NOTICE pmon#xcvrd[34]: CMIS: Ethernet0: 400G, 8-lanes, state=READY +``` diff --git a/doc/sfp-cmis/images/001.png b/doc/sfp-cmis/images/001.png new file mode 100644 index 0000000000..c764e234ed Binary files /dev/null and 
b/doc/sfp-cmis/images/001.png differ diff --git a/doc/sonic-application-extention/img/docker-infra-concepts.svg b/doc/sonic-application-extension/img/docker-infra-concepts.svg similarity index 100% rename from doc/sonic-application-extention/img/docker-infra-concepts.svg rename to doc/sonic-application-extension/img/docker-infra-concepts.svg diff --git a/doc/sonic-application-extention/img/feature-start.svg b/doc/sonic-application-extension/img/feature-start.svg similarity index 100% rename from doc/sonic-application-extention/img/feature-start.svg rename to doc/sonic-application-extension/img/feature-start.svg diff --git a/doc/sonic-application-extention/img/feature-stop.svg b/doc/sonic-application-extension/img/feature-stop.svg similarity index 100% rename from doc/sonic-application-extention/img/feature-stop.svg rename to doc/sonic-application-extension/img/feature-stop.svg diff --git a/doc/sonic-application-extention/img/install-flow.svg b/doc/sonic-application-extension/img/install-flow.svg similarity index 100% rename from doc/sonic-application-extention/img/install-flow.svg rename to doc/sonic-application-extension/img/install-flow.svg diff --git a/doc/sonic-application-extention/img/packages-json.svg b/doc/sonic-application-extension/img/packages-json.svg similarity index 100% rename from doc/sonic-application-extention/img/packages-json.svg rename to doc/sonic-application-extension/img/packages-json.svg diff --git a/doc/sonic-application-extention/img/sonic-package-integration.svg b/doc/sonic-application-extension/img/sonic-package-integration.svg similarity index 100% rename from doc/sonic-application-extention/img/sonic-package-integration.svg rename to doc/sonic-application-extension/img/sonic-package-integration.svg diff --git a/doc/sonic-application-extention/img/sonic-pkg-basic-concepts.svg b/doc/sonic-application-extension/img/sonic-pkg-basic-concepts.svg similarity index 100% rename from 
doc/sonic-application-extention/img/sonic-pkg-basic-concepts.svg rename to doc/sonic-application-extension/img/sonic-pkg-basic-concepts.svg diff --git a/doc/sonic-application-extention/img/uninstall-flow.svg b/doc/sonic-application-extension/img/uninstall-flow.svg similarity index 100% rename from doc/sonic-application-extention/img/uninstall-flow.svg rename to doc/sonic-application-extension/img/uninstall-flow.svg diff --git a/doc/sonic-application-extention/img/upgrade-flow.svg b/doc/sonic-application-extension/img/upgrade-flow.svg similarity index 100% rename from doc/sonic-application-extention/img/upgrade-flow.svg rename to doc/sonic-application-extension/img/upgrade-flow.svg diff --git a/doc/sonic-application-extension/sonic-application-extension-guide.md b/doc/sonic-application-extension/sonic-application-extension-guide.md new file mode 100644 index 0000000000..0f22ff5eb9 --- /dev/null +++ b/doc/sonic-application-extension/sonic-application-extension-guide.md @@ -0,0 +1,216 @@ + +# SONiC Application Extension Guide + + +#### Rev 0.1 + + +## Table of Content +- Revision +- Scope +- Porting an existing SONiC Docker image to be an Application Extension +- Developing a new SONiC Application Extension +- Adding 3rd party application to SONiC package database +- Building SONiC image with 3rd party application +- Manifest Reference + +### Revision + +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:-----------------------:|--------------------------------------| +| 0.1 | 02/2021 | Stepan Blyshchak | Initial Proposal | + +### Scope + +This document gives developers a quick guide through developing new application extensions or porting existing SONiC docker images into application extension compatible ones. + +It is recommended to get acquainted with [HLD](sonic-application-extention-hld.md) document before reading this document. 
+ +### Porting an existing SONiC Docker image to be an Application Extension + +It is possible to port existing SONiC docker image and make it an Application Extension. + +An example of porting DHCP relay - https://github.com/Azure/sonic-buildimage/commit/b3b6938fda9244607fb00bfd36a74bccab0c38a9. + +1. Add a new build time flag to SONiC build system to control whether to include new Docker Image *XXX*: + +Makefile.work +```makefile +INCLUDE_XXX=$(INCLUDE_XXX) +``` + +2. Register this Docker image in SONIC_PACKAGES_LOCAL target group and remove from SONIC_INSTALL_DOCKER_IMAGES: + + +rules/docker-XXX.mk +```makefile +ifeq ($(INCLUDE_XXX), y) +ifeq ($(INSTALL_DEBUG_TOOLS), y) +SONIC_PACKAGES_LOCAL += $(DOCKER_XXX_DBG) +else +SONIC_PACKAGES_LOCAL += $(DOCKER_XXX) +endif +endif +``` + +3. Remove $(DOCKER_XXX)_RUN_OPT and replace with variables used to generate manifest for the docker: + +rules/docker-XXX.mk +```makefile +$(DOCKER_XXX)_CONTAINER_PRIVILEGED = true +$(DOCKER_XXX)_CONTAINER_VOLUMES += /etc/sonic:/etc/sonic:ro +$(DOCKER_XXX)_CONTAINER_VOLUMES += /usr/share/sonic/scripts:/usr/share/sonic/scripts:ro +$(DOCKER_XXX)_CONTAINER_TMPFS += /tmp/ +```` + +These variables are used to generate manifest for docker at build time (see generate_manifest function in https://github.com/Azure/sonic-buildimage/blob/master/rules/functions): + +4. For extensions that provide CLI commands a CLI plugin is needed. + +4.1. Remove extension CLI commands from sonic-utilities code. + +4.2. Move the implementation as a separate file into sonic-buildimage under dockers/docker-xxx/cli folder. + +This plugin has to implement ```register``` function needed to be loaded by sonic-utilities core: + +dockers/docker-xxx/cli/show.py +```python +import click + +@click.command() +def example() + pass + +def register(cli): + cli.add_command(example) +``` + +```register``` may throw an exception, in this case main CLI will still work but a warning will be printed to the user. 
+ + +4.3 UT for CLI: + +In case CLI has unit tests, they need to be moved from sonic-utilities into sonic-buildimage repository and placed under dockers/docker-xxx/cli-plugins-tests/. + +In case this folder exists tests are executed at docker image build time and test log is recorder in target/docker-xxx.gz.log. + +The command line to execute tests is: + +``` +pytest-3 -v +``` + +### Developing a new SONiC Application Extension + +To develop a new SONiC Application Extension use the following example extension as a template: +https://github.com/stepanblyschak/sonic-example-extension. + +Prerequisites, build instructions and installation instructions are present in repository README.md file. + +### Adding 3rd party application to SONiC package database + +Modify files/build_templates/packages.json.j2 to include new package. Example for the previous sonic-example-extension - *cpu-report*: + +```json +{ + "cpu-report": { + "repository": "stepanblyschak/cpu-report", + "description": "CPU report example", + "default-reference": "1.0.0" + } +} +``` + +### Building SONiC image with 3rd party application + +To build SONiC image with 3rd party application pre-installed use SONIC_PACKAGES target group. +See https://github.com/Azure/sonic-buildimage/blob/master/rules/sonic-packages.mk. + +Create a file under rules/ called rules/cpu-report.mk with the following content: +```makefile +CPU_REPORT = cpu-report +$(CPU_REPORT)_REPOSITORY = stepanblyschak/cpu-report +$(CPU_REPORT)_VERSION = 1.0.0 +SONIC_PACKAGES += $(CPU_REPORT) +``` + +Additional options: + +``` +$(CPU_REPORT)_DEFAULT_FEATURE_STATE_ENABLED # "y" or "n" - whether feature is enabled by default at system start. Sets enabled in the FEATURE table. Disabled by default. +$(CPU_REPORT)_DEFAULT_FEATURE_OWNER # "local" or "kube". Default is "local". 
+``` + +### Manifest Reference + +Label name the manifest content should be written to: +``` +com.azure.sonic.manifest +``` + +The value should contain a JSON serialized as a string. + +| Path | Type | Mandatory | Description | +| ---------- | ------ | --------- | ------------------------------------------------------------------------------------------------ | +| /version | string | no | Version of manifest schema definition. Defaults to 1.0.0. | +| /package | object | no | Package related metadata information. | +| /package/version | string | yes | Version of the package. | +| /package/name | string | yes | Name of the package. | +| /package/description | string | no | Description of the package. | +| /package/depends | list | no | List of SONiC packages the service depends on. Defaults to [] | +| /package/depends[index]/name | string | yes | Name of SONiC Package | +| /package/depends[index]/version | string | no | Version constraint expression string | +| /package/depends/[index]/components | object | no | Per component version | +| /package/breaks | list | no | List of SONiC package the service breaks with. Defaults to [] | +| /package/breaks[index]/name | string | yes | Name of SONiC Package | +| /package/breaks[index]/version | string | no | Version constraint expression string | +| /package/breaks/[index]/components | object | no | Per component version | +| /package/base-os/ | object | no | Base OS versions constraints | +| /package/base-os/[index]/name | strnig | yes | Base OS component name | +| /package/base-os/[index]/version | string | yes | Base OS component version | +| /package/changelog | dict | no | Changelog dictionary. | +| /package/changelog/\ | dict | yes | Package version. | +| /package/changelog/\/changes | list of strings | yes | Changelog messages for a given version. | +| /package/changelog/\/author | string | yes | Author name. | +| /package/changelog/\/email | string | yes | Author's email address. 
| +| /package/changelog/\/date | string | yes | Date and time in RFC 2822 format. | +| /package/init-cfg | dict | no | Default package configuration in CONFIG DB format. Defaults to {} | +| /package/debug-dump | string | no | A command to be executed during system dump | +| /service | object | yes | Service management related properties. | +| /service/name | string | yes | Name of the service. There could be two packages e.g: fpm-quagga, fpm-frr but the service name is the same "bgp". For such cases each one has to declare the other service in "conflicts". | +| /service/requires | list of strings | no | List of SONiC services the application requires.

The option maps to systemd's unit "Requires=". | +| /service/requisite | list of strings | no | List of SONiC services that are requisite for this package.

The option maps to systemd's unit "Requisite=". | +| /service/wanted-by | list of strings | no | List of SONiC services that want this package.

The option maps to systemd's unit "WantedBy=". | +| /service/after | list of strings | no | Boot order dependency. List of SONiC services the application is set to start after on system boot. | +| /service/before | list of strings | no | Boot order dependency. List of SONiC services the application is set to start before on system boot. | +| /service/delayed | boolean | no | Whether to generate a timer to delay the service on boot. Defaults to false. | +| /service/dependent-of | list of strings | no | List of SONiC services this application is dependent of.

Specifying a service X in this option will regenerate the /usr/local/bin/X.sh script and update the "DEPENDENT" list with this package's service.

This option is warm-restart related: a warm restart of service X will not trigger a restart of this package's service.

On the other hand, this package's service will be started, stopped and restarted together with service X.

Example:

For "dhcp-relay", "radv", "teamd" this field will have "swss" service in the list. | +| /service/post-start-action | string | no | Path to an executable inside Docker image filesystem to be executed after container start.

A package may use this field in case a systemd service should not reach started state before some condition. E.g.: a database service should not reach started state before the redis process is ready. Since there is no control over when the redis process will start, a "post-start-action" script may execute "redis-cli ping" until the ping is successful. | +| /service/pre-shutdown-action | string | no | Path to an executable inside Docker image filesystem to be executed before container stops.

A use case is to execute a warm-shutdown preparation script.

A script that sends SIGUSR1 to teamd to initiate warm shutdown is one such example. | +| /service/host-service | boolean | no | Multi-ASIC field. Whether a service should run in host namespace. Default is True. | +| /service/asic-service | boolean | no | Multi-ASIC field. Whether a service should run per ASIC namespace. Default is False. | +| /service/warm-shutdown/ | object | no | Warm reboot related properties. Used to generate the warm-reboot script. | +| /service/warm-shutdown/after | list of strings | no | Warm shutdown order dependency. List of SONiC services the application is set to stop after on warm shutdown.

Example: a "bgp" may specify "radv" in this field in order to avoid radv to announce departure and cause hosts to lose default gateway.

*NOTE*: Putting "radv" here, does not mean the "radv" should be installed as there is no such dependency for the "bgp" package. | +| /service/warm-shutdown/before | lits of strings | no | Warm shutdown order dependency. List of SONiC services the application is set to stop before on warm shutdown.

Example: a "teamd" service has to stop before "syncd", but after "swss" to be able to send the last LACP PDU though CPU port right before CPU port becomes unavailable. | +| /service/fast-shutdown/ | object | no | Fast reboot related properties. Used to generate the fast-reboot script. | +| /service/fast-shutdown/after | lits of strings | no | Same as for warm-shutdown. | +| /service/fast-shutdown/before | lits of strings | no | Same as for warm-shutdown. | +| /processes | object | no | Processes infromation | +| /processes/[name]/reconciles | boolean | no | Wether process performs warm-boot reconciliation, the warmboot-finalizer service has to wait for. Defaults to False. | +| /container | object | no | Container related properties. | +| /container/privileged | string | no | Start the container in privileged mode. Later versions of manifest might extend container properties to include docker capabilities instead of privileged mode. Defaults to False. | +| /container/volumes | list of strings | no | List of mounts for a container. The same syntax used for '-v' parameter for "docker run".

Example: "\:\:\". Defaults to []. | +| /container/mounts | list of objects | no | List of mounts for a container. Defaults to []. | +| /container/mounts/[id]/source | string | yes | Source for mount | +| /container/mounts/[id]/target | string | yes | Target for mount | +| /container/mounts/[id]/type | string | yes | Type for mount. See docker mount types. | +| /container/tmpfs | list of strings | no | Tmpfs mounts. Defaults to [] | +| /container/environment | dict | no | Environment variables for Docker container (key=value). Defaults to {}. | +| /processes | list | no | A list defining processes running inside the container. | +| /cli | object | no | CLI plugin information. *NOTE*: Later will deprecated and replaced with a YANG module file path. | +| /cli/mandatory | boolean| no | Wether CLI is a mandatory functionality for the package. Default: False. | +| /cli/show-cli-plugin | string | no | A path to a plugin for sonic-utilities show CLI command. | +| /cli/config-cli-plugin | string | no | A path to a plugin for sonic-utilities config CLI command. | +| /cli/clear-cli-plugin | string | no | A path to a plugin for sonic-utilities sonic-clear CLI command. 
| + + diff --git a/doc/sonic-application-extention/sonic-application-extention-hld.md b/doc/sonic-application-extension/sonic-application-extention-hld.md similarity index 100% rename from doc/sonic-application-extention/sonic-application-extention-hld.md rename to doc/sonic-application-extension/sonic-application-extention-hld.md diff --git a/doc/sonic-application-extention/sonic-versioning-strategy.md b/doc/sonic-application-extension/sonic-versioning-strategy.md similarity index 100% rename from doc/sonic-application-extention/sonic-versioning-strategy.md rename to doc/sonic-application-extension/sonic-versioning-strategy.md diff --git a/doc/sonic-build-system/build-enhancements.md b/doc/sonic-build-system/build-enhancements.md new file mode 100644 index 0000000000..a86ae53750 --- /dev/null +++ b/doc/sonic-build-system/build-enhancements.md @@ -0,0 +1,561 @@ + + +# Build Improvements HLD + +#### Rev 0.2 + +# Table of Contents + +- [List of Tables](#list-of-tables) +- [Revision](#revision) +- [Definition/Abbreviation](#definitionabbreviation) +- [About This Manual](#about-this-manual) +- [Introduction and Scope](#1-introduction-and-scope) + - [Current build infrastructure](#11-existingtools-limitation) + - [Benefits of this feature](#12-benefits-of-this-feature) +- [Feature Requirements](#2-feature-requirements) + - [Functional Requirements](#21-functional-requirements) + - [Configuration and Management Requirements](#22-configuration-and-management-requirements) + - [Scalability Requirements](#23-scalability-requirements) + - [Warm Boot Requirements](#24-warm-boot-requirements) +- [Feature Description](#3-feature-description) +- [Feature Design](#4-feature-design) + - [Overview](#41-design-overview) + - [Docker-in-Docker build](#42-db-changes) + - [SONIC version cache build](#42-db-changes) + - [Installer Image Optimization](#42-db-changes) +- [Serviceability and Debug](#6-serviceability-and-debug) +- [Warm reboot Support](#7-warm-reboot-support) +- [Unit Test 
Cases ](#8-unit-test-cases) +- [References ](#9-references) + +# List of Tables + +[Table 1: Abbreviations](#table-1-abbreviations) + +# Revision +| Rev | Date | Author | Change Description | +|:--:|:--------:|:-----------------:|:------------------------------------------------------------:| +| 0.1 | | Kalimuthu Velappan | Initial version | + + +# Definition/Abbreviation + +### Table 1: Abbreviations + +| **Term** | **Meaning** | +| -------- | ----------------------------------------- | +| DPKG | Debian Package | +| DinD | Docker-in-Docker | +| DooD | Docker-out-of-Docker | + + +# About this Manual + +This document provides general information about the build improvements in SONiC. + + +# Introduction and Scope + +This document describes the Functionality and High level design of the build improvement in SONiC. + +- The current SONiC environment uses container environment for generating the sonic packages, docker container images and installer images with rootfs. +- On every soonic build, it downloads source code, binary packages, docker images and other tools and utilities from an external world and generates the build artifacts. +- Inter-dependency between the targets could prevent the build parallelism and cause more delay in the overall build time. +- Nested docker container would slowdown the Hardware resource access - CPU, memory, Network and Filesystem. + + +This feature provides improvements in three essential areas. +- Multi user build + - Parallel build using Native docker mode. + - OverlayFS to virtualize the build root. +- Build time Optimization + - Parallel make jobs - Passing dh '-parallel' flags to all the build targets. + - Binary image build optimization + - Use tmpfs and OverlayFS to speed up the build process. +- Build caching + - Version cache - Package cache support for build componets that are downloaded from external world. + - Image cache support for installer image componets. 
+ +Reference: +- Version caching feature is enhanced on top of DPKG caching and Versioning framework. +Ref: + - https://github.com/Azure/SONiC/blob/master/doc/sonic-build-system/DPKG%20caching%20framework%20.ppt + - https://github.com/xumia/SONiC/blob/repd3/doc/sonic-build-system/SONiC-Reproduceable-Build.md + +# Feature Requirements + - Feature should support build improvements in overall SONiC build. + - Enhances the build to run in more parallel mode and optimize the time consuming build paths. + +## Functional Requirements + +Following requirements are addressed by the design presented in this document: + +- Multiuser mode support: + - Add a feature in the build infra to support the multiuser container build using native docker mode. + - Option to enable/disable the Native docker mode. + - Use Jinja template to render the per user sonic Dockerfile.j2 + - Use OverlayFS to virtualize the build root to resolve inter target dependency. + +- Build optimization: + - Build optimizatoin for binary image generation. + - Pass dh '-parallel' option to all the make targets. + - Add caching support for binary image. + - Add support for build time dependency over overlayFS support. + - Use tmpfs and OverlayFS to speed up the per target build process. + +- Caching Requirements: + - Sonic image is built by pulling binary and source components from various sources. + - Debian repo, python repo, docker repo, web repo, git module and go module repo. + - Requires flexibility to select the different versions of a component. + - Sonic development is diverged into multiple development branches. + - Each development branch needs different version of build components. + - Sonic moves to latest after every release. + - Release branch needs fixed version of build components as the prebuilt binary and source packages are keep moving to the latest version + - Requires Caching/Mirroring support. + - Component changes outside the SONIC repo which causes frequent build failures. 
+ - Unavailability of external site causes the dependency build failure. + - Flexibility to switch between fixed version vs latest version. + - Different branch can freeze different´ set of versions. + - Still, Individual package should be upgraded to selected versions. + - Versions cache should be enabled/disabled globally. + + +## Configuration and Management Requirements + +NA + +## Scalability Requirements + +NA + +## Warm Boot Requirements + +NA + + +# Feature Description + +This feature provides build improvements in SONIC. + +# Feature Design +## Design Overview +## Multi user Build +### Native docker mode +- Docker supports two types of mode to run a container. + - Docker-in-Docker(DinD) mode + - Native Docker or Docker-out-of-Docker(DooD) mode + +- Docker-In-Docker mode. + - Installing and running another Docker engine (daemon) inside a Docker container. + - Since Docker 0.6, a "privileged" option is added to allow running containers in a special mode with almost all capabilities of the host machine, including kernel features and devices acccess. + - As a consequence, Docker engine, as a privileged application, can run inside a Docker container itself. + - Docker-in-Docker solution is not recommented, especially in containerized Jenkins systems as potential problems include + - Security profile of inner Docker will conflict with one of outer Docker + - Incompatible file systems (e.g. AUFS inside Docker container). + - As a workaround to address these problems using: + - Container creation using dind docker solutions. + - To use AUFS in the inner Docker, just promote /var/lib/docker to inner docker. + - Apar´t from the security aspect, a lot of performace panalities are involved as it uses the UnionFS/OverlayFS that degrades the performace when number of lower layers are more. + - All the child container resource usage is restricted within the paraent container usage. + +- Native docker mode. 
+ - The DoD mode uses socket file(/var/run/docker.sock) to communitcate with host dockerd daemon. + - It uses the shared socket file between HOST and the container to run the build container. + - Eg: docker run -v /var/run/docker.sock:/var/run/docker.sock ... + - When a new docker container/builder/composer is invoked from a build container: + - It is started as a sibiling to build container. + - It will run in parallel with build container. + - This mode provides a better performance as it can utilize the full potential of host machine. + +#### Build Container in SONiC: +- The current SONiC build infrastructure generats all the SONiC build artifacts inside the docker container environment. When docker is isolated from the host CPU, the docker resource usage and filesystem access are restricted from its full capacity. Docker isolation is more essential for application containers, but for the build containers, the more essentail requirement is the build performace rather than adopting a higher security model. It provides the better build performance when the build containers are run in native mode. +- Sonic supports both the mode of build container creation. +- The Native docker mode gives better performace but it has some limitations: + - In a shared build servers, sonic docker creation from multiple user would give conflict as it shares the same docker image name. +- This feature addresses: + - Sonic docker container creation in parallel from multiple users. + - Since it runs as sibling container, it will provide better container performace. + - As it shares the host dockerd, it gives better performance as the multilevel UNIONFS/OverlayFS is not needed. + +#### Build Container in SONiC: + + +![ Native Docker Support ](images/sonic-native-docker-support.png) + + +- Currently, the build dockers are created as a user dockers(docker-base-stretch-, etc) that are specific to each user. 
+- But the sonic dockers (docker-database, docker-swss, etc) are created with a fixed docker name and that are common to all the users. + + - docker-database:latest + - docker-swss:latest + +- When multiple builds are triggered on the same build server that creates parallel building issue because all the build jobs are trying to create the same docker with latest tag. This happens only when sonic dockers are built using native host dockerd for sonic docker image creation. + +- This feature creates all sonic dockers with user tag. +- While saving and loading the sonic dockers, it rename the sonic dockers with appropriate user tag. +- Docker image Load and Save operations are protected with global lock. +- The user tag is created with combination of user name and SHA ID of Docker control files(Dockerfile.j2, etc) +- Different user tag is genenrated for a different branch of the same user. + +- Docker image save sequence protected with lock as bellow, + - docker_image_lock() + - docker tag docker-name-\:latest docker-name:latest + - docker save docker-name:latest > docker-name-\.gz + - docker rm docker-name:latest + - docker_image_unlock() + +- Docker image load sequence protected with lock as bellow, + - docker_image_lock() + - docker load -i < docker-name-\.gz + - docker tag docker-name:latest docker-name-\:latest + - docker rm docker-name:latest + - docker_image_unlock() + +- The user sonic docker names are derived from '_LOAD_DOCKERS' make variable and using Jinja template, it replaces the FROM docker name with correct user sonic docker name for loading and saving the docker image. + +- The template processing covers only for common dockers, Broadcom and VS platform dockers. For other vendor specific dockers, respective vendors need to add the support. + +### Target Specific Build Root + +- OverlayFS allows multiple virtual rootfs creation for target specific build. 
+- Virtual bulid root - Merging the container root(/) and SONiC source(/sonic) and mounted into target specific virutal build root using OverlayFS. +- Use tmpfs mount for better performance. +- This would avoid the target specific(UNINSTALLS) and improve the parallel build performance. +![Virtual Build Root](images/virtual-build-root.png) + - \# mkdir -p BUILD + - \# mkdir -p BUILD + - \# mount -t tmpfs -o size=1G tmpfs BUILD (optional - performace) + - \# mount -t overlay overlay -olowerdir=/,upperdir=/sonic/,workdir=BUILD/work BUILD/sonic-buildimage/ + - \# chroot BUILD/sonic-buildimage/ /bin/bash + - bash# mount -t proc proc /proc + - bash# dpkg -i && make + +### Parallel Make + - Propogate the DEB_BUILD_OPTIONS='--parallel' to all it sub target. + - Progation of parallel option to python pip install packages through ENV export. + +## Version cache support + +### Version components + +- Sonic build downloads lots of component from external web which includes + - Source code files + - Prebuilt debian packages + - Python PIP packages + - Git source code + - Docker images + - Go modules + - Other tools and utilities + +- These components are getting updated fequently and the changes are dynamic in nature. +- Versioning feature support the sonic build to particular version of the package to be downloaded/installed. +- Versioning ability to select the particular package version, but still it will fetch the package from external world. +- When external site is down, selected package version is not available or any other issues with connecting to external site or downloading the package would lead to sonic build failure. +- Due to this dynamic nature, every sonic build might have to change its dependency chain. +- Version files are stored at files/build/versions folder as below hierarchy. 
+``` +files/build/versions/ +├── build +│   ├── build-sonic-slave-buster +│   │   ├── versions-deb-buster-amd64 +│   │   ├── versions-py2-buster-amd64 +│   │   └── versions-py3-buster-amd64 +├── default +│   ├── versions-docker +│   ├── versions-git +│   └── versions-web +├── dockers +│   ├── docker-base-buster +│   │   ├── versions-deb-buster-amd64 +│   │   ├── versions-py3-buster-amd64 +│   | ├── versions-git +│   | ├── versions-web + ... +``` + + +![Package Versioning](images/package-versioning.png) + +### Version Cache feature +- The version cache feature allows the sonic build system to cache all the source, binary and its dependencies into local file system. +- When version cache feature is enabled, first it checks local cache storage for requested package, if it is available, it loads from the cache else it will download from the external web. + +![Version Caching](images/version-caching.png) + +### Build Version Design +- Version control files are copied to + - To slave container for package build. + - To docker builder for sonic slave docker creation. + - To docker builder for sonic docker creation. + - To Rootfs for binary image generation. + +![ Build Version caching ](images/build-version-caching.png) + +- Based on the package version, corresponding file will be fetched from the cache if exists. +- Otherwise the file will be downloaded from the web and cache will be updated with newer version. +- Version cache feature supports caching for following build components. + - DPKG packages + - PIP packages + - Python packages + - Wget/Curl packages + - GO modules + - GIT modules + - Docker images + +#### Debian version cache + + - Debian packages are version controlled via preference file that specify each package and corresponding version as below. + - iproute==1.0.23 + + - When deb package gets installed, it looks for the package version from the version control file. 
If matches, it installs the package with the specified version in the version control file. + - During the package installation, it also save the package into the below cache path. + - /var/cache/apt/archives/ + - If package is already available in the cache path, then it directly installs the package without downloading from the external site. + - With the version cache enabled, it preloads all cached packages into deb cache folder, so that any subsequent deb installation will always use the cached path. + +![ Debian Packages ](images/dpkg-version-caching.png) + +#### PIP version cache + - PIP packages are version controlled via constraint file that specify each package and corresponding version as below. + - ipaddress==1.0.23 + - + - When a pip package gets installed, it looks for the package version from the version control file. If matches, it installs the package with the specified version in the version control file. + - During the package installation, it also save the package into the cache path as below. + - pip/http/a/4/6/b/7/a46b74c1407dd55ebf9eeb7eb2c73000028b7639a6ed9edc7981950c + - If package is already available in the pip cache path, then it directly installs the package without downloading from the external site. + - With the version cache enabled, it preloads all cached packages into pip cache folder, so that any subsequent pip installation will always use the cached path. + - During pip installation, the cache path can be specified with --cache-dir option which stores the cache data in the specified directory and version constraint file is given as --constraint option. + - Pip vcache folders are created under slave container name or sonic container name appropriately. + +![ Python Packages ](images/pip-version-caching.png) + +#### Python version cache + - Python packages are created via setup.py file. + - These packages and their dependencies listed in the setup.py are version controlled via SHA id of the package. 
+ - During python package build, python uses setup.py to scan through the dependencies and prerequisties, and then downloads and install them into .eggs folder. + - If .eggs folders already exists, it will not reinstall the dependencies. + - With version cache enabled, it stores the .eggs files into vcache as a compressed tar file. + - Cache file name is formed using SHA value of setup.py. + - During package build, if .eggs file exist already, it loads the .eggs from vcache and proceeds with package build. + +![ Python Packages ](images/python-version-caching.png) + +#### Git clones + - Git clone modules are version controlled via commit hash. + - On a git clone attempt, version control file(versions-git) is first checked to see if the attempted git clone(url) entry is present, + - if entry is not present, then it downloads from the external world and saves the the downloaded git clone as git bundle file into vcache with the commit hash in its name and also updates the version control file. + Example: cache file name is formed using url and the commit hash + https://salsa.debian.org/debian/libteam.git-f8808df228b00873926b5e7b998ad8b61368d4c5.tgz + - if entry is present but git bundle file is not available in vcache, then it downloads from the external world and saves it into vcache with the commit hash + in its name. + - if entry is present and git bundle file is available in vcache, it gets loaded, unbundled & checkedout with specific commit. + - If git clone has any submodules, it is also handled. + - The submodules' git bundles are tared along with the main bundle and stored in the vcache. On loading, this tar file will be untared first before unbundling & checking out each submodules' git bundle. + + + +![ GIT Modules ](images/git-module-version-caching.png) + +#### Docker Images + - Docker images are version controlled via its SHA id. + - During docker image creation, version control script gets executed. 
+ - The _PULL_DOCKER variable in the docker Make rule indicates whether the docker needs to be downloaded from docker hub or not. + - version control script will look for the matching entry in version control file. + - If not present, then it downloads the image and saves in to vcache in gz format and updates the version control file. The cache filename is formed using dockername combined with SHA id. + Example: debian-stetch-sha256-7f2706b124ee835c3bcd7dc81d151d4f5eca3f4306c5af5c73848f5f89f10e0b.tgz + + - If present but not available in the cache, then it downloads the image and saves into saves in to cache in gz format. + - If present and the docker image is availabe in cache, then it preloads the docker image for container preparation. + + ![ Docker Images ](images/docker-image-version-caching.png) + + +#### Wget/Curl Packages + - wget/curl packages are controlled via URL and SHA id of the package. + - On wget attempt, version control file(versions-git) is first checked to see if the attempted url entry is present, + - if entry is not present, then it downloads from the external world and saves the the downloaded package into vcache with the SHA id of the package in its name and also updates the version control file. + Example: cache file name is formed using url and the SHA id. + https://salsa.debian.org/debian/libteam.src.gz-f8808df228b00873926b5e7b998ad8b61368d4c5.tgz + - if entry is present but package is not available in vcache, then it downloads from the external world and saves it into vcache. + - if entry is present and package is also available in vcache, it gets copied from the vcache. + +![ Wget Packages ](images/web-version-caching.png) + +#### Go modules + - In SONiC, all the go modules are installed from go.mod file. 
+ - HASH value is calculated from the following contents: + - go.mod + - Makefile + - Common Files + - ENV flags + - It caches all the go module files as a directory structure instead of compressed tar file as it gives better performace when number of files are more. + - Different directory hierarchy is created for each HASH value. + - If HASH matches, it uses rsync to sync the cached modules to GOPATH build directory. + - While storing/retrieving, the cache content is always protected with global lock. + +![ GO Modules ](images/go-module-version-caching.png) + +## Docker Build Version Caching + +- Each docker build is version controlled via + - Dockerfile.j2 + - Makefile + - Commonfiles + - ENV flags +- SHA value is calculated from version control files. +- Cache file is created for each docker with the docker name and SHA value calculated. +- Cache file contains the following: + - Debian packages + - pip packages + - wget packages + - git packages + - go modules +- Version control script will place the cache file into appropriate location inside docker builder. +- With version control enabled, docker cache if exists already gets loaded else it will create and update the cache. +![ Docker Build Version Caching ](images/docker-build-version-caching.png) +- + +## Installer Image Build Optimization + +# Installer image generation has six stages: + + - Bootstrap generation + - ROOTFS installation + - SONiC packages installation + - SQUASHFS generation + - DockerFS generation + - Installer image generation + + + + +### Image Preparation: +- Split into two parts: + 1. Debian packages + - Bootstrap preparation + - General packages installation, such as curl, vim, sudo, python3, etc + 2. Sonic packages + - Packages that are built and installed from from sonic repo. + - Docker images that are built and installed from from sonic repo + +- Step (1) can be generated as a base image and it can be run in parallel with the other targets, before build image step. 
+ - Benifits: + - High hit rate, for less dependencies. + - Reduce the cache size. + - Improve the concurrency when cache not hit, the step has small dependencies, can be run with any other steps. + +#### Bootstrap generation + - Debian bootstrap package files are prepared using debootstrap tool. + - It downloads set of bootstrap packages and generates the bootstrap filesystem. + - Initially, it downloads all the packages and creates them as image file and store them into version cache storage. + - Image file is created with specific filename and the HASH value. + - HASH value is calculated from SHA value of bootstrap control files which includes: + - build_debian.sh + - sonic_debian_extension.sh + - Version files + - Common makefiles and script utilities. + - Env Flags + - On the subsequent build, if calculated HASH maches with existing version cache filename, it loads the boostrap files from cache. + + +#### Rootfs preparation +![ Binary Image Generation ](images/binary-image-generation.png) + +- Rootfs files system is prepared on top of bootstrap packages. +- It is prepared by downloading the various opensource debian packages, tools and utilities that are needed for SONiC applications and install them on top of bootstrap fs. +- The rootfs file system is created as image file system and cached as part of version cache system. +- Image file is created with installer name and HASH value. +- The HASH value is calculated from SHA value of following files: + - build_debin.sh + - sonic_build_extention.j2 + - Common makefiles + - ENV flags +- On the subsequent build, mount the rootfs from image cache file if exists in version cache. +- It uses the version control to install the cached packages in one place. + +![ Binary Image Version Caching ](images/binary-image-version-caching.png) +#### SONiC packages installation +- Install all the sonic packages. +- Host services, configuration and utilities are installed. 
+
+#### SQUASHFS generation
+- SquashFS is a readonly filesystem and it is created using the squashfs command.
+- It is a compressed version of rootfs contents.
+
+#### dockerfs preparation
+- Dockerfs is created by importing all the sonic docker images and tarring the /var/log/docker folder.
+- Dockerfs directory is linked to non rootfs directory by mounting an external filesystem to ROOTFS.
+- Parallel loading of docker from compressed gz file.
+
+#### Installer Image generation
+- Tar with pigz compression to get better compression speed as well as compression ratio.
+- Uses the config file to choose the different compression options.
+
+#### Parallel build option
+
+- Stage build provides two stage build.
+  - Phase 1 - Rootfs generation as part of other package generation.
+  - Phase 2 - Docker generation in parallel.
+
+# Make variables
+- The following make variable controls the version caching feature.
+
+  - SONIC_VERSION_CONTROL_COMPONENTS= => Turn on/off the versioning
+  - SONIC_VERSION_CACHE_METHOD=cache=. => Turn on/off version caching
+  - SONIC_VERSION_CACHE_SOURCE= => Cache directory path
+
+# Version freeze
+- Weekly/periodical with version migration to latest.
+
+  - Build with SONIC_VERSION_CONTROL_COMPONENTS=none to generate the new set of package versions in the target.
+  - Run 'make freeze' to generate and merge the version changes into the source.
+  - Check-in the new version set with the source.
+
+
+# Cache cleanup
+
+- Recently used cache files are updated with a newer timestamp. The cache framework automatically touches each used cache file to set it to the current timestamp.
+- Touch marks a cache file as recently used, so that files which have not been used recently can be cleaned up.
+  - touch <cache-dir>/<package>/<file>.tgz
+- Least-recently-used cache file cleanup command:
+
+```
+
+   find -name "*.tgz" ! -mtime -7 -exec rm {} \;
+
+   Where:
+   	-mtime n => Files were modified within last n*24 hours .
+   	-mtime -7  => means ( -7 * 24 ) => Files were modified within last 7 days
+   	! 
-mtime -7 => Files were modified 7 days ago
+
+```
+
+## Build Time Compression
+
+### PoC Build( Buster )
+- Build Config:
+  - Release: Buster
+  - Filesystem: Local
+  - CPU core: 40 Core
+  - DPKG_CACHE: Enabled
+  - VERSION_CACHE: Enabled
+- Build Time:
+  - 5 Minutes ( Before: >40 Minutes )
+
+### Build Time Measurement
+| **Feature** | **Normal Build** | **Build Enhancement** |
+| --------------------------------- | -------------| -------------------------- |
+| DPKG_CACHE=N<br>
VERSION_CACHE=N | \ | \ | +| DPKG_CACHE=Y
VERSION_CACHE=y | \ | \ | +| DPKG_CACHE=N
VERSION_CACHE=y | \ | \ | + +# TODO: +- Migration to bullseye release + + +## References + +- Ref: + - https://github.com/Azure/SONiC/blob/master/doc/sonic-build-system/DPKG%20caching%20framework%20.ppt + - https://github.com/xumia/SONiC/blob/repd3/doc/sonic-build-system/SONiC-Reproduceable-Build.md diff --git a/doc/sonic-build-system/images/binary-image-generation.png b/doc/sonic-build-system/images/binary-image-generation.png new file mode 100644 index 0000000000..14de137acc Binary files /dev/null and b/doc/sonic-build-system/images/binary-image-generation.png differ diff --git a/doc/sonic-build-system/images/binary-image-version-caching.png b/doc/sonic-build-system/images/binary-image-version-caching.png new file mode 100644 index 0000000000..255560c250 Binary files /dev/null and b/doc/sonic-build-system/images/binary-image-version-caching.png differ diff --git a/doc/sonic-build-system/images/build-enhancements.md b/doc/sonic-build-system/images/build-enhancements.md new file mode 100644 index 0000000000..26fd832068 --- /dev/null +++ b/doc/sonic-build-system/images/build-enhancements.md @@ -0,0 +1,463 @@ + + +# Build Improvements HLD + +#### Rev 0.2 + +# Table of Contents + +- [List of Tables](#list-of-tables) +- [Revision](#revision) +- [Definition/Abbreviation](#definitionabbreviation) +- [About This Manual](#about-this-manual) +- [Introduction and Scope](#1-introduction-and-scope) + - [Current build infrastructure](#11-existingtools-limitation) + - [Benefits of this feature](#12-benefits-of-this-feature) +- [Feature Requirements](#2-feature-requirements) + - [Functional Requirements](#21-functional-requirements) + - [Configuration and Management Requirements](#22-configuration-and-management-requirements) + - [Scalability Requirements](#23-scalability-requirements) + - [Warm Boot Requirements](#24-warm-boot-requirements) +- [Feature Description](#3-feature-description) +- [Feature Design](#4-feature-design) + - [Overview](#41-design-overview) + - 
[Docker-in-Docker build](#42-db-changes) + - [SONIC version cache build](#42-db-changes) + - [Installer Image Optimization](#42-db-changes) +- [Serviceability and Debug](#6-serviceability-and-debug) +- [Warm reboot Support](#7-warm-reboot-support) +- [Unit Test Cases ](#8-unit-test-cases) +- [References ](#9-references) + +# List of Tables + +[Table 1: Abbreviations](#table-1-abbreviations) + +# Revision +| Rev | Date | Author | Change Description | +|:--:|:--------:|:-----------------:|:------------------------------------------------------------:| +| 0.1 | | Kalimuthu Velappan | Initial version | + + +# Definition/Abbreviation + +### Table 1: Abbreviations + +| **Term** | **Meaning** | +| -------- | ----------------------------------------- | +| DPKG | Debian Package | +| DinD | Docker-in-Docker | +| DooD | Docker-out-of-Docker | + + +# About this Manual + +This document provides general information about the build improvements in SONiC. + + +# Introduction and Scope + +This document describes the Functionality and High level design of the build improvement in SONiC. + +- The current SONiC environment uses container environment for generating the sonic packages, docker container images and installer images with rootfs. +- On every soonic build, it downloads source code, binary packages, docker images and other tools and utilities from an external world and generates the build artifacts. + +This feature provides improvements in three essential areas. + - Build container creation using native docker mode. + - Package cache support for build componets that are downloaded from external world. + - Image cache support for installer image components. + + - Version cache feature is supported on top existing versioning feature. 
+ - ref: - [https://github.com/xumia/SONiC/blob/repd3/doc/sonic-build-system/SONiC-Reproduceable-Build.md +](url) +# Feature Requirements + +## Functional Requirements + +Following requirements are addressed by the design presented in this document: + +- Multiuser mode support: + - Add a feature in the build infra to support the multiuser container build using native docker mode. + +- Build optimization: + - Build optimizatoin for binary image generation. + - Add caching support for binary image. + - Add support for build time dependency over overlayFS support. + +- Caching Requirements: + - Sonic image is built by pulling binary and source components from various sources. + - Debian repo, python repo, docker repo, http(s) repo and go module repo. + - Requires flexibility to select the different versions of a component. + - Sonic development is diverged into multiple development branches. + - Each development branch needs different version of build components. + - Sonic moves to latest after every release. + - Release branch needs fixed version of build components as the prebuilt binary and source packages are keep moving to the latest version + - Requires Caching/Mirroring support. + - Component changes outside the SONIC repo which causes frequent build failures. + - Unavailability of external side causes the dependency build failure. + - Flexibility to switch between fixed version vs latest version. + - Different branch can freeze different set of versions. + - Still, Individual package should be upgraded to selected versions. + - Versions cache should be enabled/disabled globally. + - Unavailability of external sites should not cause the dependency build failures. + + + + + +## Configuration and Management Requirements + +NA + +## Scalability Requirements + +NA + +## Warm Boot Requirements + +NA + + +# Feature Description + +This feature provides build improvements in SONIC. 
+ +# Feature Design +## Design Overview +- Docker supports two types of mode to run a container. + - Docker-in-Docker(DinD) mode + - Native Docker or Docker-out-of-Docker(DooD) mode + +- Docker-In-Docker mode. + - Installing and running another Docker engine (daemon) inside a Docker container. + - Since Docker 0.6, a “privileged” option is added to allow running containers in a special mode with almost all capabilities of the host machine, including kernel features and devices acccess. + - As a consequence, Docker engine, as a privileged application, can run inside a Docker container itself. + - Docker-in-Docker solution is not recommented, especially in containerized Jenkins systems as potential problems include + - Security profile of inner Docker will conflict with one of outer Docker + - Incompatible file systems (e.g. AUFS inside Docker container). + - As a workaround to address these problems using: + - Container creation using dind docker solutions. + - To use AUFS in the inner Docker, just promote /var/lib/docker to inner docker. + - Apart from the security aspect, a lot of performace panaliteis are involved as it uses the UnionFS/OverlayFS that degrades the performace when number of lower layers are more. + - All the child container resource usage is restricted within the paraent container usage. + +- Native docker mode. + - The DoD mode uses socket file(/var/run/docker.sock) to communitcate with host dockerd daemon. + - It uses the shared socket file between HOST and the container to run the build container. + - Eg: docker run -v /var/run/docker.sock:/var/run/docker.sock ... + - When a new docker container/builder/composer is invoked from a build container: + - It is started as a sibiling to build container. + - It will run in parallel with build container. + - This mode provides a better performance as it can utilize the full potential of host machine. 
+ +### Build Container in SONiC: +- The current SONiC build infrastructure generats all the SONiC build artifacts inside the docker container environment. When docker is isolated from the host CPU, the docker resource usage and filesystem access are restricted from its full capacity. Docker isolation is more essential for application containers, but for the build containers, the more essentail requirement is the build performace rather than adopting a higher security model. It provides the better build performance when the build containers are run in native mode. +- Sonic supports both the mode of build container creation. +- The Native docker mode gives better performace but it has some limitations: + - In a shared build servers, sonic docker creation from multiple user would give conflict as it shares the same docker image name. +- This feature addresses: + - Sonic docker container creation in parallel from multiple users. + - Since it runs as sibiling container, it will degrade the parent container performace. + - As it shares the host dockerd, it gives better performance as the multilevel UNIONFS/OverlayFS is not needed. + +#### Build Container in SONiC: + + +![ Native Docker Support ](images/sonic-native-docker-support.png) + + +- Currently, the build dockers are created as user dockers(docker-base-stretch-, etc) that are specific to each user. But the sonic dockers (docker-database, docker-swss, etc) are created with a fixed docker name and that are common to all the users. + + - docker-database:latest + - docker-swss:latest + +- When multiple builds are triggered on the same build server that creates parallel building issue because all the build jobs are trying to create the same docker with latest tag. This happens only when sonic dockers are built using native host dockerd for sonic docker image creation. 
+
+- This feature creates all sonic dockers as user sonic dockers and then, while saving and loading the user sonic dockers, it renames the user sonic dockers into the correct sonic dockers with the tag as latest.
+
+- The user sonic docker names are derived from '_LOAD_DOCKERS' make variable and using Jinja template, it replaces the FROM docker name with the correct user sonic docker name for
+  loading and saving the docker image.
+
+- The template processing covers only the common dockers, Broadcom and VS platform dockers. For other vendor specific dockers, respective vendors need to add the support.
+
+
+## Version cache support
+
+### Version components
+
+- Sonic build downloads lots of components from the external web which includes
+	- Source code files
+	- Prebuilt debian packages
+	- Python PIP packages
+	- Git source code
+	- Docker images
+	- Go modules
+	- Other tools and utilities
+
+- These components are getting updated frequently and the changes are dynamic in nature.
+- The versioning feature allows the sonic build to pin a particular version of each package to be downloaded/installed.
+- Versioning provides the ability to select a particular package version, but it will still fetch the package from the external world.
+- When the external site is down, the selected package version is not available, or any other issue occurs while connecting to the external site or downloading the package, the sonic build would fail.
+- Due to this dynamic nature, every sonic build might have to change its dependency chain.
+- Version files are stored at files/build/versions folder as below hierarchy.
+``` +files/build/versions/ +├── build +│   ├── build-sonic-slave-buster +│   │   ├── versions-deb-buster-amd64 +│   │   ├── versions-py2-buster-amd64 +│   │   └── versions-py3-buster-amd64 +├── default +│   ├── versions-docker +│   ├── versions-git +│   └── versions-web +├── dockers +│   ├── docker-base-buster +│   │   ├── versions-deb-buster-amd64 +│   │   ├── versions-py3-buster-amd64 +│   | ├── versions-git +│   | ├── versions-web + ... +``` + +![Package Versioning](images/package-versoning.png) + +### Version Cache feature +- The version cache feature allows the sonic build system to cache all the source, binary and its dependencies into local file system. When version cache feature is enabled, first it checks local cache storage for requested package, if it is available, it loads from the cache else it will download from the external web. + +![Version Caching](images/version-caching.png) + +### Build Version Design +- Version control files are copied to + - To slave container for package build. + - To docker builder for sonic slave docker creation. + - To docker builder for sonic docker creation. + - To Rootfs for binary image generation. + +![ Build Version caching ](images/build-version-caching.png) + +- Based on the package version, corresponding file will be fetched from the cache if exists. +- Otherwise the file will be downloaded from the web and cache will be updated with newer version. +- Version cache feature supports caching for following build components. + - DPKG packages + - PIP packages + - Python packages + - Wget/Curl packages + - GO modules + - GIT modules + - Docker images + +#### Debian version cache + + - Debian packages are version controlled via preference file that specify each package and corresponding version as below. + - iproute==1.0.23 + - When deb package gets installed, it looks for the package version from the version control file. If matches, it installs the package with the specified version in the version control file. 
+ - During the package installation, it also save the package into the below cache path. + - /var/cache/apt/archives/ + - If package is already available in the cache path, then it directly installs the package without downloading from the external site. + - With the version cache enabled, it preloads all cached packages into deb cache folder, so that any subsequent deb installation will always use the cached path. + +![ Debian Packages ](images/dpkg-version-caching.png) + +#### PIP version cache + - PIP packages are version controlled via constraint file that specify each package and corresponding version as below. + - ipaddress==1.0.23 + - When a pip package gets installed, it looks for the package version from the version control file. If matches, it installs the package with the specified version in the version control file. + - During the package installation, it also save the package into the cache path as below. + - pip/http/a/4/6/b/7/a46b74c1407dd55ebf9eeb7eb2c73000028b7639a6ed9edc7981950c + - If package is already available in the pip cache path, then it directly installs the package without downloading from the external site. + - With the version cache enabled, it preloads all cached packages into pip cache folder, so that any subsequent pip installation will always use the cached path. + - During pip installation, the cache path can be specified with --cache-dir option which stores the cache data in the specified directory and version constraint file is given as --constraint option. + - Pip vcache folders are created under slave container name or sonic container name appropriately. + +![ Python Packages ](images/pip-version-caching.png) + +#### Python version cache + - Python packages are created via setup.py file. + - These packages and their dependencies listed in the setup.py are version controlled via SHA id of the package. 
+ - During python package build, python uses setup.py to scan through the dependencies and prerequisties, and then downloads and install them into .eggs folder. + - If .eggs folders already exists, it will not reinstall the dependencies. + - With version cache enabled, it stores the .eggs files into vcache as a compressed tar file. + - Cache file name is formed using SHA value of setup.py. + - During package build, if .eggs file exist already, it loads the .eggs from vcache and proceeds with package build. + +![ Python Packages ](images/python-version-caching.png) + +#### Git clones + - Git clone modules are version controlled via commit hash. + - On a git clone attempt, version control file(versions-git) is first checked to see if the attempted git clone(url) entry is present, + - if entry is not present, then it downloads from the external world and saves the the downloaded git clone as git bundle file into vcache with the commit hash in its name and also updates the version control file. + Example: cache file name is formed using url and the commit hash + https://salsa.debian.org/debian/libteam.git-f8808df228b00873926b5e7b998ad8b61368d4c5.tgz + - if entry is present but git bundle file is not available in vcache, then it downloads from the external world and saves it into vcache with the commit hash + in its name. + - if entry is present and git bundle file is available in vcache, it gets loaded, unbundled & checkedout with specific commit. + - If git clone has any submodules, it is also handled. + - The submodules' git bundles are tared along with the main bundle and stored in the vcache. On loading, this tar file will be untared first before unbundling & checking out each submodules' git bundle. + + + +![ GIT Modules ](images/git-module-version-caching.png) + +#### Docker Images + - Docker images are version controlled via its SHA id. + - During docker image creation, version control script gets executed. 
+ - The _PULL_DOCKER variable in the docker Make rule indicates whether the docker needs to be downloaded from docker hub or not. + - version control script will look for the matching entry in version control file. + - If not present, then it downloads the image and saves in to vcache in gz format and updates the version control file. The cache filename is formed using dockername combined with SHA id. + Example: debian-stetch-sha256-7f2706b124ee835c3bcd7dc81d151d4f5eca3f4306c5af5c73848f5f89f10e0b.tgz + + - If present but not available in the cache, then it downloads the image and saves into saves in to cache in gz format. + - If present and the docker image is availabe in cache, then it preloads the docker image for container preparation. + + ![ Docker Images ](images/docker-image-version-caching.png) + + +#### Wget/Curl Packages + - wget/curl packages are controlled via URL and SHA id of the package. + - On wget attempt, version control file(versions-git) is first checked to see if the attempted url entry is present, + - if entry is not present, then it downloads from the external world and saves the the downloaded package into vcache with the SHA id of the package in its name and also updates the version control file. + Example: cache file name is formed using url and the SHA id. + https://salsa.debian.org/debian/libteam.src.gz-f8808df228b00873926b5e7b998ad8b61368d4c5.tgz + - if entry is present but package is not available in vcache, then it downloads from the external world and saves it into vcache. + - if entry is present and package is also available in vcache, it gets copied from the vcache. + +![ Wget Packages ](images/web-version-caching.png) + +#### Go modules + - In SONiC, all the go modules are installed from go.mod file. 
+ - HASH value is calculated from the following contents: + - go.mod + - Makefile + - Common Files + - ENV flags + - It caches all the go module files as a directory structure instead of compressed tar file as it gives better performace when number of files are more. + - Different directory hierarchy is created for each HASH value. + - If HASH matches, it uses rsync to sync the cached modules to GOPATH build directory. + - While storing/retrieving, the cache content is always protected with global lock. + +![ GO Modules ](images/go-module-version-caching.png) + +## Docker Build Version Caching + +- Each docker build is version controlled via + - Dockerfile.j2 + - Makefile + - Commonfiles + - ENV flags +- SHA value is calculated from version control files. +- Cache file is created for each docker with the docker name and SHA value calculated. +- Cache file contains the following: + - Debian packages + - pip packages + - wget packages + - git packages + - go modules +- Version control script will place the cache file into appropriate location inside docker builder. +- With version control enabled, docker cache if exists already gets loaded else it will create and update the cache. +![ Docker Build Version Caching ](images/docker-build-version-caching.png) +- + +## Installer Optimization + +# Installer image generation has six stages: + - bootstrap generation + - ROOTFS installation + - SONiC packages installation + - SQASHFS generation + - DockerFS generation + - Installer image generation + +#### Bootstrap generation + - Debian bootstrap package files are prepared using debootstrap tool. + - It downloads set of bootstrap packages and generates the bootstrap filesystem. + - Initially, it downloads all the packages and creates them as image file and store them into version cache storage. + - Image file is created with specific filename and the HASH value. 
+ - HASH value is calculated from SHA value of bootstrap control files which includes: + - build_debian.sh + - sonic_debian_extension.sh + - Version files + - Common makefiles and script utilities. + - Env Flags + - On the subsequent build, if calculated HASH maches with existing version cache filename, it loads the boostrap files from cache. + + +#### Rootfs preparation +- Rootfs files system is prepared on top of bootstrap packages. +- It is prepared by downloading the various opensource debian packages, tools and utilities that are needed for SONiC applications and install them on top of bootstrap fs. +- The rootfs file system is created as image file system and cached as part of version cache system. +- Image file is created with installer name and HASH value. +- The HASH value is calculated from SHA value of following files: + - build_debin.sh + - sonic_build_extention.j2 + - Common makefiles + - ENV flags +- On the subsequent build, mount the rootfs from image cache file if exists in version cache. +- It uses the version control to install the cached packages in one place. + +![ Binary Image Version Caching ](images/binary-image-version-caching.png) +#### SONiC packages installation +- Install all the sonic packages. +- Host services, configuration and utilities are installed. + +#### SQASHFS generation +- SquashFS is a readonly filesystem and it created using squashfs command. +- It is a compressed version of rootfs contents. + +#### dockerfs preparation +- Dockerfs is created by importing all the sonic docker images and taring /var/log/docker folder. +- Dockerfs directory is linked to non rootfs directory by mounting an external filesystem to ROOTFS. +- Parallel loading of docker from compressed gz file. + +#### Installer Image generation +- Tar with pigz compression to get better compression speed as well as compression ratio. +- Uses the config file to choose the different compression options. 
+ +#### Parallel build option + +- Stage build provides two stage build. + - Phase 1 - Rootfs generation as part of other package generation. + - Phase 2 - Docker generation in parallel. + +# Version freeze +- Weekly/periodical with version migration to latest. + + - Build with SONIC_VERSION_CONTROL_COMPONENTS=none to generate the new set of package versions in the target. + - Run ‘make freeze’ to generate and merge the version changes into the source. + - Check-in the new version set in the source. + +# Make variables +- The following make variable controls the version caching feature. + + - SONIC_VERSION_CONTROL_COMPONENTS= => Turn on/off the versioning + - SONIC_VERSION_CACHE_METHOD=cache=. => Turn on/off version caching + - SONIC_VERSION_CACHE_SOURCE= => Cache directory path + +# Cache cleanup + +- Recently used cache files are updated with newer timestamp. The Cache framework automatically touch the used cache files to current timestamp. +- Touch is used to update the package to latest, so the files that are not recent, that can be cleaned up. + - touch //.tgz +- Least-recently-used cache file cleanup command: + +``` + + find -name “*.tgz” ! -mtime -7 –exec rm {} \; + + Where: + -mtime n => Files were modified within last n*24 hours . + -mtime -7 => means ( -7 * 24 ) => Files were modified within last 7 days + ! -mtime -7 => Files were modified 7 days ago + +``` + +## Build Time Compression + +| **Feature** | **Normal Build** | **Build Enhacement** | +| --------------------------------- | -------------| -------------------------- | +| DPKG_CACHE=N
VERSION_CACHE=N | \ | \ | +| DPKG_CACHE=Y
VERSION_CACHE=y | \ | \ | +| DPKG_CACHE=N
VERSION_CACHE=y | \ | \ | + +## References +https://github.com/xumia/SONiC/blob/repd3/doc/sonic-build-system/SONiC-Reproduceable-Build.md diff --git a/doc/sonic-build-system/images/build-version-caching.png b/doc/sonic-build-system/images/build-version-caching.png new file mode 100644 index 0000000000..fc270f6609 Binary files /dev/null and b/doc/sonic-build-system/images/build-version-caching.png differ diff --git a/doc/sonic-build-system/images/docker-build-version-caching.png b/doc/sonic-build-system/images/docker-build-version-caching.png new file mode 100644 index 0000000000..bd03214a00 Binary files /dev/null and b/doc/sonic-build-system/images/docker-build-version-caching.png differ diff --git a/doc/sonic-build-system/images/docker-image-version-caching.png b/doc/sonic-build-system/images/docker-image-version-caching.png new file mode 100644 index 0000000000..c8f439f4f1 Binary files /dev/null and b/doc/sonic-build-system/images/docker-image-version-caching.png differ diff --git a/doc/sonic-build-system/images/dpkg-version-caching.png b/doc/sonic-build-system/images/dpkg-version-caching.png new file mode 100644 index 0000000000..72e50f67e1 Binary files /dev/null and b/doc/sonic-build-system/images/dpkg-version-caching.png differ diff --git a/doc/sonic-build-system/images/git-module-version-caching.png b/doc/sonic-build-system/images/git-module-version-caching.png new file mode 100644 index 0000000000..f1377b33db Binary files /dev/null and b/doc/sonic-build-system/images/git-module-version-caching.png differ diff --git a/doc/sonic-build-system/images/go-module-version-caching.png b/doc/sonic-build-system/images/go-module-version-caching.png new file mode 100644 index 0000000000..2e167ad1c5 Binary files /dev/null and b/doc/sonic-build-system/images/go-module-version-caching.png differ diff --git a/doc/sonic-build-system/images/package-versioning.png b/doc/sonic-build-system/images/package-versioning.png new file mode 100644 index 0000000000..57bddda576 Binary 
files /dev/null and b/doc/sonic-build-system/images/package-versioning.png differ diff --git a/doc/sonic-build-system/images/package-versoning.png b/doc/sonic-build-system/images/package-versoning.png new file mode 100644 index 0000000000..4a4d2fb7ef Binary files /dev/null and b/doc/sonic-build-system/images/package-versoning.png differ diff --git a/doc/sonic-build-system/images/pip-version-caching.png b/doc/sonic-build-system/images/pip-version-caching.png new file mode 100644 index 0000000000..08e7caddc7 Binary files /dev/null and b/doc/sonic-build-system/images/pip-version-caching.png differ diff --git a/doc/sonic-build-system/images/python-version-caching.png b/doc/sonic-build-system/images/python-version-caching.png new file mode 100644 index 0000000000..8dfec4b78a Binary files /dev/null and b/doc/sonic-build-system/images/python-version-caching.png differ diff --git a/doc/sonic-build-system/images/sonic-native-docker-support.png b/doc/sonic-build-system/images/sonic-native-docker-support.png new file mode 100644 index 0000000000..8f3c463e07 Binary files /dev/null and b/doc/sonic-build-system/images/sonic-native-docker-support.png differ diff --git a/doc/sonic-build-system/images/version-caching.png b/doc/sonic-build-system/images/version-caching.png new file mode 100644 index 0000000000..86593bccca Binary files /dev/null and b/doc/sonic-build-system/images/version-caching.png differ diff --git a/doc/sonic-build-system/images/virtual-build-root.png b/doc/sonic-build-system/images/virtual-build-root.png new file mode 100644 index 0000000000..64584cd28f Binary files /dev/null and b/doc/sonic-build-system/images/virtual-build-root.png differ diff --git a/doc/sonic-build-system/images/web-version-caching.png b/doc/sonic-build-system/images/web-version-caching.png new file mode 100644 index 0000000000..561857bcbc Binary files /dev/null and b/doc/sonic-build-system/images/web-version-caching.png differ diff --git 
a/doc/sonic-build-system/img/sai-sonic-build-system.drawio.png b/doc/sonic-build-system/img/sai-sonic-build-system.drawio.png new file mode 100755 index 0000000000..93fce0634b Binary files /dev/null and b/doc/sonic-build-system/img/sai-sonic-build-system.drawio.png differ diff --git a/doc/sonic-build-system/img/sonic-sairedis-check.drawio.png b/doc/sonic-build-system/img/sonic-sairedis-check.drawio.png new file mode 100755 index 0000000000..a9e1cbad72 Binary files /dev/null and b/doc/sonic-build-system/img/sonic-sairedis-check.drawio.png differ diff --git a/doc/sonic-build-system/saiversioncheck.md new file mode 100644 index 0000000000..67f077b018 --- /dev/null +++ b/doc/sonic-build-system/saiversioncheck.md @@ -0,0 +1,84 @@ +# SAI API version check + +## Motivation + +SONiC is not designed to work in backward compatibility with older vendor SAI implementations. +SAI headers that SONiC's syncd daemon is compiled against are taken from OCP SAI repository while +the actual libsai.so is taken from sonic-buildimage vendor's directory. This leads to a situation +that sometimes SAI in sonic-sairedis repository is updated but vendor SAI in sonic-buildimage is not. + +This may lead to: + - Compilation failure because of ABI changes (syncd cannot be successfully linked with libsai.so) + - Attributes ID mismatch, as we add new attributes in a non-backward compatible manner. The result is syncd termination due to invalid usage of attributes or hidden incorrect behavior. + - Enum values mismatch, as we add new values to enums in a non-backward compatible manner. + - Etc. + + +## SONiC buildsystem overview + +This is an illustration of how the build system works: + +

+Figure 1. SONiC build +

+ +Sonic-sairedis contains syncd source code. Syncd is compiled against SAI headers from sonic-sairedis repository and then linked against vendor libsai.so from sonic-buildimage repository. +In case someone updates sonic-sairedis with new SAI headers and tries to update the submodule in sonic-buildimage, PR checkers that perform the SONiC build should fail. +The one who wants to update SAI version needs to make sure all SAI vendor implementations are updated in the same PR to not break the image. + +It is also worth mentioning that some vendors just provide the binary libsai.so unlike Nvidia where we have SAI headers that are provided by Mellanox-SAI repository. + +## Proposal + +SAI already has a SAI_API_VERSION define in headers (saiversion.h): + +```c +#define SAI_MAJOR 1 +#define SAI_MINOR 9 +#define SAI_REVISION 1 + +#define SAI_VERSION(major, minor, revision) (10000 * (major) + 100 * (minor) + (revision)) + +#define SAI_API_VERSION SAI_VERSION(SAI_MAJOR, SAI_MINOR, SAI_REVISION) +``` + +Currently, given just the libsai.so file, it is not possible to know which SAI headers it was compiled against, as these defines are in the headers. +We need an API in libsai.so to get the API version that this libsai.so implementation is aligned to. + +The proposal is to add such API: + +```c +/** + * @brief Retrieve a SAI API version this implementation is aligned to + * + * @param[out] version Version number + * + * @return #SAI_STATUS_SUCCESS on success, failure status code on error + */ +sai_status_t sai_query_api_version( + _Out_ sai_api_version_t *version); +``` + +The implementation is simple: + +```c +sai_status_t sai_query_api_version( + _Out_ sai_api_version_t *version) +{ + *version = SAI_API_VERSION; + return SAI_STATUS_SUCCESS; +} +``` + +This SAI_API_VERSION is the one derived from headers that were used by vendor SAI (headers on the left on the Figure 1.). + +Using that new API we can implement a configure-time check in sonic-sairedis with autotools AC_TRY_RUN: + +

+Figure 2. SONiC sairedis check +

+ +The check will compare the vendor SAI API version (on the left on the Figure 1) with sairedis SAI API version (on the right on the Figure 2.) and fail if they do not match. +In case, SAI versioning follows sematical versioning rules, the test program can implement a check for only MAJOR and MINOR version, relaxing the constraint on the PATCH version. + +## Questions diff --git a/doc/sonic-multi-architecture/sonic_arm_support.md b/doc/sonic-multi-architecture/sonic_arm_support.md new file mode 100644 index 0000000000..b6ce22318e --- /dev/null +++ b/doc/sonic-multi-architecture/sonic_arm_support.md @@ -0,0 +1,231 @@ +# SONIC ARM Architecture support + +[![Marvell Technologies](https://www.marvell.com/content/dam/marvell/en/rebrand/marvell-logo3.svg)](https://www.marvell.com/) + +# Description + + - This document describes enhancement in SONIC build script to support ARM32 and ARM64 + +Support for ARM architecture needs changes in the following modules + + - sonic-slave + - dockers + - rules + - Makefile + - Buildscript + - Repo list + - Onie Build + + + +### User Input + +Similar to configuring the platform in the Make, architecture should be user driven. + +* [SONIC_ARCH] - make configure PLATFORM=[ASIC_VENDOR] PLATFORM_ARCH=[armhf] +* Default is X86_64 + +### Dockers +Since all the modules and code are compiled inside docker environment, the docker image should be based on multiarch/[distribution]-[arm_arch] + +Below dockers use the debian distribution which will now be based on the CPU Architecture distribution. 
+```sh +dockers/docker-base +dockers/docker-base-stretch +dockers/docker-ptf +``` + +### Developer Notes +Following are the variables used in make files +PLATFORM_ARCH : specifies the target architecture, if not set amd64 is chosen +CONFIGURED_ARCH : In Makefiles, no where amd64 should be hardcoded, instead $(CONFIGURED_ARCH) has to be used +```sh +Example: in place of amd64 in below target CONFIGURED_ARCH is replaced +LINUX_IMAGE = linux-image-$(KVERSION)_$(KERNEL_VERSION)-$(KERNEL_SUBVERSION)_amd64.deb +LINUX_IMAGE = linux-image-$(KVERSION)_$(KERNEL_VERSION)-$(KERNEL_SUBVERSION)_$(CONFIGURED_ARCH).deb +``` + + +### SONIC Slave Docker + +sonic-slave docker provides build environment for the rest of the dockers, it should be able to run the different architecture on the host cpu architecture. + +To do such cross compilation, we can make use of binfmt-misc to run target arch binary using qemu-static binary to run on the host cpu architecture. + +```sh +sonic-slave-arm64 +sonic-slave-armhf +``` + +qemu static binaries need to be installed and docker for multiarch/qemu-user-static:register is enabled to run. + +### Miscellaneous + +Architecture specific packages need to installed or ignored. +Like ixgbe and grub are specific to X86 architecture, which need to be excluded. + + +### Platform + +Same platform or board can have variants in CPU vendor. To address this, platform can be made ARCH specific, and customized changes can be added in this platform specific make infra. 
+ +```sh +platform/marvell-armhf/docker-syncd-mrvl-rpc.mk +platform/marvell-armhf/docker-syncd-mrvl-rpc/99-syncd.conf +platform/marvell-armhf/docker-syncd-mrvl-rpc/Dockerfile.j2 +platform/marvell-armhf/docker-syncd-mrvl-rpc/ptf_nn_agent.conf +platform/marvell-armhf/docker-syncd-mrvl.mk +platform/marvell-armhf/docker-syncd-mrvl/Dockerfile.j2 +platform/marvell-armhf/docker-syncd-mrvl/start.sh +platform/marvell-armhf/docker-syncd-mrvl/supervisord.conf +platform/marvell-armhf/docker-syncd-mrvl/syncd.sh +platform/marvell-armhf/libsaithrift-dev.mk +platform/marvell-armhf/linux-kernel-armhf.mk +platform/marvell-armhf/one-image.mk +platform/marvell-armhf/platform.conf +platform/marvell-armhf/rules.mk +platform/marvell-armhf/sai.mk +platform/marvell-armhf/sai/Makefile +``` + +#### Rule/makefile + +Hardcoded "amd64" need to be replaced with Makefile variable which hold the target architecture. +* amd64 +* armhf +* arm64 + +```sh +rules/bash.mk +rules/docker-base-stretch.mk +rules/docker-base.mk +rules/docker-ptf.mk +rules/docker-snmp-sv2.mk +rules/frr.mk +rules/gobgp.mk +rules/hiredis.mk +rules/iproute2.mk +rules/isc-dhcp.mk +rules/libnl3.mk +rules/libteam.mk +rules/libyang.mk +rules/linux-kernel.mk +rules/lldpd.mk +rules/lm-sensors.mk +rules/mpdecimal.mk +rules/python3.mk +rules/quagga.mk +rules/radvd.mk +rules/redis.mk +rules/sairedis.mk +rules/smartmontools.mk +rules/snmpd.mk +rules/socat.mk +rules/swig.mk +rules/swss-common.mk +rules/swss.mk +rules/tacacs.mk +rules/telemetry.mk +rules/thrift.mk +slave.mk +src/bash/Makefile +src/hiredis/Makefile +src/iproute2/Makefile +src/isc-dhcp/Makefile +src/libnl3/Makefile +src/libteam/Makefile +src/lm-sensors/Makefile +src/mpdecimal/Makefile +src/python3/Makefile +src/radvd/Makefile +src/redis/Makefile +src/smartmontools/Makefile +src/snmpd/Makefile +src/socat/Makefile +src/tacacs/nss/Makefile +src/tacacs/pam/Makefile +src/thrift/Makefile + +``` + +### Repo list +Below repo sources list need to updated as the azure debian repo 
doesn't have arm packages + + +```sh +files/apt/sources.list-armhf +files/build_templates/sonic_debian_extension.j2 + +``` + +#### Onie Image + +Onie image configuration and build script should be updated for the uboot specific environment for ARM. +Update target platform for Onie image platform configuration in onie image conf. + - onie-image.conf for AMD64 + - onie-image-armhf.conf for ARMHF + - onie-image-arm64.conf for ARM64 +Onie platform config file will chosed based on the target platform + - platform//platform.conf + platform.conf will be used by the onie installer script to install the onie image +Onie Installer scripts + - installer/x86_64/install.sh + - installer/arm64/install.sh + - installer/armhf/install.sh + +SONIC Image installation is driven by these onie installer scripts which does + - Boot loader update with image boot details + - Partition the primary disk if not formatted/partitioned + - Extract sonic image in the mounted disk under /host directory + +For different platforms, the primary storage device may vary, unlike X86 platforms which mainly use varieant of sata disks, +ARM platform can also use NAND/NOR flash or SD/MMC cards +The platform dependent partition scheme is moved to platform//platform.conf, where +selecting primary storage medium, partitioning, formatting, and mounting is taken care. +The mount path is provided to the generic SONIC installer script, which does common functionalities of extracting image, and copying files. + +X86 uses grub as its bootloader, where ARM can use Uboot or proprietary bootloaders. +Bootloader configuration for boot image details are also updated in platform.conf + +#### Sonic Installer + +SONIC upgrade from SONIC uses python scripts to access bootloader configuration to update the boot image details, to support +image upgrade, image deletion, and change boot order. +For ARM Uboot firmware utilities is used to access boot configuration, as in grub for X86. 
+ - sonic_installer/main.py + +### Kernel ARM support + +Submodule sonic-linux-kernel Makefile and patch need to be updated to compile for respective ARM architecture. As kernel .config will be generated using debian build infra, dpkg env variables need to properly updated to select the architecture. + + - src/sonic-linux-kernel + +### Custom Kernel (Expert Mode) + +Based on architecture the linux kernel may vary and need to be changed to custom kernel rather that the SONIC default kernel version. +This can be addressed in platform specific makefiles. + + - platform/marvell-armhf/linux-kernel-armhf.mk + + +### Usage for ARM Architecture +To build Arm32 bit for (ARMHF) plaform + + # Execute make configure once to configure ASIC and ARCH + make configure PLATFORM=[ASIC_VENDOR] SONIC_ARCH=armhf + **example**: + make configure PLATFORM=marvell-armhf SONIC_ARCH=armhf + +To build Arm64 bit for plaform + + # Execute make configure once to configure ASIC and ARCH + make configure PLATFORM=[ASIC_VENDOR] SONIC_ARCH=arm64 + **example**: + make configure PLATFORM=marvell-arm64 SONIC_ARCH=arm64 + +---- +Author +====== +Antony Rheneus [arheneus@marvell.com] +Copyright Marvell Technologies + diff --git a/doc/srv6/images/Srv6ConfigDBFrr.png b/doc/srv6/images/Srv6ConfigDBFrr.png new file mode 100644 index 0000000000..40b9b5b656 Binary files /dev/null and b/doc/srv6/images/Srv6ConfigDBFrr.png differ diff --git a/doc/srv6/images/Srv6Example.png b/doc/srv6/images/Srv6Example.png new file mode 100644 index 0000000000..78bdafccd2 Binary files /dev/null and b/doc/srv6/images/Srv6Example.png differ diff --git a/doc/srv6/images/drawing-configdb-frr3.png b/doc/srv6/images/drawing-configdb-frr3.png new file mode 100644 index 0000000000..2a1cb9359f Binary files /dev/null and b/doc/srv6/images/drawing-configdb-frr3.png differ diff --git a/doc/srv6/images/srv6db.png b/doc/srv6/images/srv6db.png new file mode 100644 index 0000000000..f86ae95e0d Binary files /dev/null and 
b/doc/srv6/images/srv6db.png differ diff --git a/doc/srv6/images/srv6orch.png b/doc/srv6/images/srv6orch.png new file mode 100644 index 0000000000..1535c9a63f Binary files /dev/null and b/doc/srv6/images/srv6orch.png differ diff --git a/doc/srv6/srv6_hld.md b/doc/srv6/srv6_hld.md new file mode 100644 index 0000000000..6cc62a9b84 --- /dev/null +++ b/doc/srv6/srv6_hld.md @@ -0,0 +1,598 @@ +# Segment Routing over IPv6 (SRv6) HLD + +# Table of Contents + +- [List of Tables](#list-of-tables) +- [Revision](#revision) +- [Definition/Abbreviation](#definitionabbreviation) +- [About This Manual](#about-this-manual) +- [1 Introuduction and Scope](#1-introuduction-and-scope) +- [2 Feature Requirements](#2-feature-requirements) +- [2.1 Functional Requirements](#21-functional-requirements) +- [2.2 Configuration and Managment Requirements](#22-configuration-and-management-requirements) +- [2.3 Warm Boot Requirements](#23-warm-boot-requirements) +- [3 Feature Design](#3-feature-design) +- [3.1 ConfigDB Changes](#31-configdb-changes) +- [3.2 AppDB Changes](#32-appdb-changes) +- [3.3 Orchestration Agent Changes](#33-orchestration-agent-changes) +- [3.4 SAI](#34-sai) +- [3.5 YANG Model](#35-yang-model ) +- [4 Unit Test](#4-unit-test) +- [5 References ](#5-references) + +# Revision + +| Rev | Date | Author | Change Description | +| :--: | :-------: | :------------------------: | :---------------------: | +| 0.1 | 6/5/2021 | Heidi Ou, Kumaresh Perumal | Initial version | +| 0.2 | 8/24/2021 | Dong Zhang | More explanation | +| 0.3 | 10/15/2021| Kumaresh Perumal | Minor updates | +| 0.4 | 10/26/2021| Kumaresh Perumal | Update MY_SID table. 
| + + +# Definition/Abbreviation + +### Table 1: Abbreviations + +| ****Term**** | ****Meaning**** | +| -------- | ----------------------------------------- | +| BFD | Bidirectional Forwarding Detection | +| BGP | Border Gateway Protocol | +| BSID | Binding SID | +| G-SID | Generalized Segment Identifier | +| SID | Segment Identifier | +| SRH | Segment Routing Header | +| SRv6 | Segment Routing IPv6 | +| TE | Traffic Engineering | +| uSID | Micro Segment | +| VNI | VXLAN Network Identifier | +| VRF | Virtual Routing and Forwarding | + +# About this Manual + +This document provides general information about the Segmentation Routing over IPv6 feature implementation in SONiC. It is based on IETF RFC 8754 and RFC 8986. + +# 1 Introuduction and Scope + +This document describes the Functionality and High level design of the SRv6 feature. + +SRv6 has been widely adopted as an IPv6 based SDN solution, which provides programming ability, TE capabilities, and deployment simplicity to network administrators. With current support from a rich ecosystem, including major ASIC manufactures, networking vendors and open source communities, the deployment of SRv6 is accelerating. We want to add SRv6 into SONIC to benefit users in DC as well as beyond DC. + +The following are some use cases for SRv6 deployment: + +- v4/6VPN, EVPN over best-effort +- Traffic steering over TE policy + +In SRv6 domain, TE policy associated with SID list could be configured on headend nodes, to steer traffic with SRH encapsulation. When traffic reaches egress nodes, the packets are processed based on local defined functions, for example SID list decapsulation and FIB lookup in a particular VRF . 
+ +# 2 Feature Requirements + +## 2.1 Functional Requirements + +This section describes the SONiC requirements for SRv6 feature in phases: + +At a high level the following should be supported: + +Phase #1 + +​ Should be able to perform the role of SRv6 domain headend node, and endpoint node, more specific: +- Support END, Endpoint function - The SRv6 instantiation of a prefix SID +- Support END.DT46, Endpoint with decapsulation and IP table lookup - IP L3VPN use (equivalent of a per-VRF VPN label) +- Support H.Encaps.Red, H.Encaps with Reduced Encapsulation +- Support traffic steering on SID list + +Later phases: +- Support H.Encaps, SR Headend Behavior with Encapsulation in an SR Policy +- Support END.B6.Encaps, Endpoint bound to an SRv6 encapsulation Policy - SRv6 instantiation of a Binding SID +- Support END.B6.Encaps.Red, END.B6.Encaps with reduced SRH insertion - SRv6 instantiation of a Binding SID +- Support END.X, Endpoint function with Layer-3 cross-connect - The SRv6 instantiation of a Adj SID +- Support uSID/G-SID +- Other programming functions +- Support HMAC option +- Support sBFD for SRv6 +- Support anycast SID + +This document will focus on Phase #1, while keep the design extendable for future development + +## 2.2 Configuration and Management Requirements + +1. User should be able to enable SRv6 globally + +2. User should be able to configure SID list for encapsulation + +3. User should be able to configure SRv6 steering policy + +4. User should be able to configure endpoint action and corresponding argument for matched local SID + +## 2.3 Warm Boot Requirements + +Warm reboot is intended to be supported for planned system, swss and BGP warm reboot. + + + +# 3 Feature Design + +![draw-configdb](images/Srv6ConfigDBFrr.png) + +Before FRR is ready, we will use static configuration to set SIDs and apply policy for TE. It enables basic SRv6 operation and populates SRv6 into ASIC, allows SRv6 data plane forwarding. 
More complicated SRv6 policy can be enabled when SRv6 is fully supported in FRR and passed from FRR to fpmsyncd. + +For Phase#1, Controller will update SRV6 related tables in APPL_DB directly using Translib and other SONiC management framework. Sonic-swss python scripts are also used to update SRV6 APPL_DB tables. + +## 3.1 ConfigDB Changes + +**SRV6_SID_LIST_TABLE** + +Description: New table that stores SRv6 SID list configuration. + +Schema: + +``` +; New table +; holds SRv6 SID list + +key = SRV6_SID_LIST|segment_name + ; SID segment name +; field = value +path = SID, ; List of SIDs + +For example: + "SRV6_SID_LIST": { + "seg1": { + "path": [ + "BABA:1001:0:10::", + "BABA:1001:0:20:F1::" + ] + }, + "seg2": { + "path": [ + "BABA:1001:0:30::", + "BABA:1001:0:40:F1::" + ] + } + } +``` + +**SRV6_MY_SID_TABLE** + +Description: New table to hold local SID to behavior mapping + +Schema: + +``` +; New table +; holds local SID to behavior mapping, allow 1:1 or n:1 mapping + +key = SRV6_MY_SID_TABLE|ipv6address +; field = value +block_len = blen ; bit length of block portion in address, default 40 +node_len = nlen ; bit length of node ID portion in address, default 24 +func_len = flen ; bit length of function portion in address, default 16 +arg_len = alen ; bit length of argument portion in address +action = behavior ; behaviors defined for local SID +vrf = VRF_TABLE.key ; VRF name for END.DT46, can be empty +adj = address, ; Optional, list of adjacencies for END.X +policy = SRV6_POLICY.key ; Optional, policy name for END.B6.ENCAP +source = address, ; Optional, list of src addrs for encap for END.B6.ENCAP + +For example: + "SRV6_MY_SID_TABLE" : { + "BABA:1001:0:20:F1::" : { + "action": "end.dt46", + "vrf": "VRF-1001" + }, + "BABA:1001:0:40:F1::" : { + "action": "end.dt46", + "vrf": "VRF-1001" + }, + "BABA:1001:0:20:F2::" : { + "action": "end.x", + "adj": [ + BABA:2001:0:10::1, + BABA:2001:0:10::2 + ], + }, + "BABA:1001:0:20:F3::" : { + "action": "end.b6.encap", + 
"policy": "policy1" + "source": "A::1" + } + } +``` + +**SRV6_POLICY_TABLE** + +Description: New table that stores SRv6 policy . + +Schema: + +``` +; New table +; holds SRv6 policy + +key = SRV6_POLICY|policy_name + +; field = value +segment = SRv6_SID_LIST.key, ; List of segment names + +For example: + "SRV6_POLICY": { + "policy1": { + "segment": ["seg1", "seg2"] + }, + "policy2": { + "segment": ["seg1"] + } + } +``` + +**SRV6_STEER_MAP** + +Description: New table that stores prefix to policy mapping . + +Schema: + +``` +; New table +; holds prefix to SRv6 SID list encapsulation mapping + +key = SRV6_STEER|VRF_NAME:prefix + ; Prefix to be steered +; field = value +policy = SRV6_POLICY.key ; Policy to steer the prefix +source = address ; Source addresses for encapsulation + +For example: + "SRV6_STEER": { + "Vrf-red|11.11.11.0/24": { + "policy": "policy1", + "source": "A::1" + }, + "Vrf-blue|2001:a::0/64": { + "policy": "policy2", + "source": "A::1" + } + } +``` + +## 3.2 AppDB changes + +**New SRV6_SID_LIST_TABLE** + +Description: New table to hold SRv6 SID list. + +Schema: + +``` +; New table +; holds SRv6 SID list + +key = SRV6_SID_LIST_TABLE:segment_name + +; field = value +path = SID, ; List of SIDs +``` + +**New SRV6_MY_SID_TABLE** + +Description: New table to hold local SID to behavior mapping + +Schema: + +``` +; New table +; holds local SID to behavior mapping + +key = SRV6_MY_SID_TABLE:block_len:node_len:func_len:arg_len:ipv6address + +; field = value +action = behavior ; behaviors defined for local SID +vrf = VRF_TABLE.key ; VRF name for END.DT46, can be empty +adj = address, ; List of adjacencies for END.X, can be empty +segment = SRv6_SID_LIST.key, ; List of segment names for END.B6.ENCAP, can be empty +source = address, ; List of src addrs for encap for END.B6.ENCAP +``` + +**Modify ROUTE_TABLE** + +Description: Existing Route Table is extended to add SID list. 
+ +Schema: + +``` +;Stores a list of routes +;Status: Mandatory + +key = ROUTE_TABLE:VRF_NAME:prefix ; +nexthop = prefix, ; IP addresses separated ',' (empty indicates no gateway). May indicate VXLAN endpoint if vni_label is non zero. +intf = ifindex? PORT_TABLE.key ; zero or more separated by ',' (zero indicates no interface) +vni_label = VRF.vni ; zero or more separated by ',' (empty value for non-vxlan next-hops). May carry MPLS label in future. +router_mac = mac_address ; zero or more remote router MAC address separated by ',' (empty value for non-vxlan next-hops) +blackhole = BIT ; Set to 1 if this route is a blackhole (or null0) +segment = SRV6_SID_LIST.key ; New optional field. List of segment names, separated by ',' +seg_src = address ; New optional field. Source addrs for sid encap +``` + +**Two cases:** + +**CASE A :** route entry with the same key(VRF_NAME:prefix ) already exists in APPL_DB ROUTE_TABLE + +**CASE B:** route entry with the same key(VRF_NAME:prefix ) DOES NOT exist in APPL_DB ROUTE_TABLE + +For both cases, we don't care fields **nexthop**, **intf**, **vni_lable**, **route_mac** and **blackhole**, since srv6 related fields will be added which includes segments. Segments actually is lists of sids which tell the packets will be added SRV6 encap header and SID list will be used for nexthop lookup in SRV6Orch. + + + +For Controller, it only needs to below information and update APPL_DB ROUTE_TABLE no matter it exists or not. + +**key**: the key in ROUTE_TABLE is the same as the one in SRV6_STEER_MAP + +**segment**: form SRV6_STEER_MAP entry, the policy field indicates the entry in SRV6_POLICY_TABLE, the segment field information is there. Srv6Orch will use segment to find sid list and sids for nexthop lookup. 
+ +**seg_src**: form SRV6_STEER_MAP entry, the source field indicates what will be used here + + EXAMPLE : how to modify ROUTE_TABLE + current CONFIG_DB: + "SRV6_SID_LIST": { + "seg1": { + "path": [ + "BABA:1001:0:10::", + "BABA:1001:0:20:F1::" + ] + }, + "seg2": { + "path": [ + "BABA:1001:0:30::", + "BABA:1001:0:40:F1::" + ] + } + } + + "SRV6_STEER": { + "Vrf-red|11.11.11.0/24": { + "policy": "policy1", + "source": "A::1" + }, + "Vrf-blue|2001:a::0/64": { + "policy": "policy2", + "source": "A::1" + } + } + + "SRV6_POLICY": { + "policy1": { + "segment": "seg1, seg2" + }, + "policy2": { + "segment": "seg1" + } + } + + current APPL_DB: + "ROUTE_TABLE": { + "Vrf-red:11.11.11.0/24": { + "nexthop" : "109.109.109.109", + "ifname" : "Vlan1001", + "vni_label" : "1001", + "router_mac" : "c6:97:75:ed:06:72" + } + } + + future APPL_DB: + "ROUTE_TABLE": { + "Vrf-red:11.11.11.0/24": { + "nexthop" : "109.109.109.109", + "ifname" : "Vlan1001", + "vni_label" : "1001", + "router_mac" : "c6:97:75:ed:06:72", + + "segment": "seg1,seg2", + "seg_src": "A::1" + } + } + +SRV6_STEER_TABLE generated route entry has higher priority than the entry in ROUTE_TABLE if any matched. Controller will update ROUTE_TABLE entry and modify it in APPL_DB ROUTE_TABLE if any. + +In Srv6Orch, it will mark which route entry is Srv6 modified and having higher priority to do SID and nexthop lookup, FRR or other modules cannot modify these high priority routes, they can only be modified via Srv6Orch. + +**Resolve SID NextHop Via Controller or Others:** + +If the SID subnet (below example, 2000::31 on E31) is directly connected to E11, the nexthop could be found, if not, we should have a controller to indicate nexthop information on E11 for subnet 2000::31, since FRR is not involved at this moment on Phase #1. A static route should be installed via controller in APPL_DB ROUTE_TABLE. 
Or the network itself has some basic ipv6 protocol is ruuning, and all the basic ipv6 informaion is fully exchanged, it depends on how the architecture is designed. + +Beside adding/modifing routes, controller could delete routes. When controller deletes some routes, then the higher priority flag will be removed and the routes will be deleted. Frr or other modules could modify the routes the same way as we did today when the srv6 high priority flag doesn't exist. + +**An Example as below:** +![draw-configdb](images/Srv6Example.png) + + +## 3.3 Orchestration Agent Changes + +New Orchagent(SRV6Orch) is created to manage all SRV6 related objects. SRV6Orchagent listens to APP_DB for regular updates and create/update SAI objects in ASIC_DB. + +![draw-configdb](images/srv6db.png) + +![draw-configdb](images/srv6orch.png) + +**SRV6Orchagent** + +This orchagent is responsible to create SRV6 related objects in ASIC_DB with the information from APP_DB. + + + +SRV6Orchagent listens to all updates of SRV6_SID_LIST_TABLE to create SAI_SEGMENTROUTE_SIDLIST object with list of V6 prefixes. It also creates a SRV6 Nexthop with the existing SIDLIST object handle. Any update to V6 prefixes to the segment will be pushed to ASIC_DB. + + + +When a route entry is added to ROUTE_TABLE, routeOrchagent calls srv6Orchagent to get the SRV6 nexthop with all associated segment prefixes. If a route entry is referenced by list of ECMP segments, the orchagent creates a ECMP group with already created Nexthop members and pass the ECMP object handle to routeOrchagent. When all route entries referenced by the ECMP groups are deleted, ECMP group object is deleted. + + + +Orchagent listens to SRV6_MY_SID_TABLE in APP_DB to create SAI objects in ASIC_DB. For SRV6_MY_SID_TABLE's END.X action, this orchagent queries the existing IP NextHop and NextHopGroup database and use the existing object handle and update ASIC_DB. 
When IP NextHop doesn't exist, SRV6_MY_SID_TABLE objects are programmed with Drop action and notify NeighOrch to resolve IP NextHop. When that NextHop is resolved, SRV6Orchagent updates SRV6_MY_SID_TABLE with valid IP NextHop handle and Forward action. This orchagent creates a new ECMP group when Nexthop exists for all the Nexthop addresses in END.X action and no matching group exists in the DB. For SRV6_MY_SID_TABLE's END.DT46 action, orchagent passes the VRF handle associated with VRF name to ASIC_DB. For SRV6_MY_SID_TABLE's END.B6 Encaps, orchagent use existing Nexthop/NexthopGroup for the list of segments or create a new NexthopGroup. + + + +**NextHopKey Changes** + + + +RouteOrch uses NexthopKey to create SAI next hop objects. To support SRV6 segments in the nextHop, key is modified to include segment string and source address string used for SRv6 source encapsulation. + + +``` +Struct NextHopKey { + + IpAddress ip_address; + + ... + string segment; + + string srv6_source; + + ... + +} +``` + + + +## 3.4 SAI + + https://github.com/opencomputeproject/SAI/compare/master...ashutosh-agrawal:srv6 + +SR Source behavior: + +1) Create a SID list object with 3 segments + + sidlist_entry_attrs[0].id = SAI_SEGMENTROUTE_SIDLIST_ATTR_TYPE + + sidlist_entry_attrs01].value.s32 = SAI_SEGMENTROUTE_SIDLIST_TYPE_ENCAPS_RED + + sidlist_entry_attrs[1].id = SAI_SEGMENTROUTE_SIDLIST_ATTR_SEGMENT_LIST + + sidlist_entry_attrs[1].value.objlist.count = 3; + + CONVERT_STR_TO_IPV6(sidlist_entry_attrs[1].value.objlist.list[0], "2001:db8:85a3::8a2e:370:7334"); + + CONVERT_STR_TO_IPV6(sidlist_entry_attrs[1].value.objlist.list[1], "2001:db8:85a3::8a2e:370:2345"); + + CONVERT_STR_TO_IPV6(sidlist_entry_attrs[1].value.objlist.list[2], "2001:db8:85a3::8a2e:370:3456"); + + saistatus = sai_v6sr_api->create_segmentroute_sidlist(&sidlist_id, switch_id, 2, sidlist_entry_attrs); + + + +2) Create Nexthop with the sidlist object + + nexthop_entry_attrs[0].id = SAI_NEXTHOP_ATTR_TYPE + + 
nexthop_entry_attrs[0].value = SAI_NEXT_HOP_TYPE_SEGMENTROUTE_SIDLIST + + nexthop_entry_attrs[1].id = SAI_NEXTHOP_ATTR_TUNNEL_ID + + nexthop_entry_attrs[1].value.oid = tunnel_id + + nexthop_entry_attrs[2].id = SAI_NEXT_HOP_ATTR_SEGMENTROUTE_SIDLIST_ID + + nexthop_entry_attrs[2].value.oid = sidlist_id + + saistatus = sai_nexthop_api->create_nexthop(&nexthop_id, switch_id, 3, nexthop_entry_attrs) + + + +3) Create route entry with SRv6 Nexthop + + route_entry.switch_id = 0 + + route_entry.vr_id = vr_id_1 // created elsewhere + + route_entry.destination.addr_family = SAI_IP_ADDR_FAMILY_IPV4 + + route_entry.destination.addr.ip4 = "198.51.100.0" + + route_entry.destination.addr.mask = "255.255.255.0" + + route_entry_attrs[0].id = SAI_ROUTE_ENTRY_ATTR_NEXT_HOP_ID; + + route_entry_attrs[0].value.oid = nexthop_id; + + saistatus = sairoute_api->create_route(&route_entry, 1, route_entry_attrs) + + + +SR Transit/Endpoint behavior + + + +my_sid_entry.switch_id = 0 + +my_sid_entry.vr_id = vr_id_1 // underlay VRF + +my_sid_entry.locator_len = 64 + +my_sid_entry.function_len = 8 + +CONVERT_STR_TO_IPV6(my_sid_entry.sid, "2001:db8:0:1::1000:0:0:0"); + + + +my_sid_attr[0].id = SAI_MY_SID_ENTRY_ATTR_ENDPOINT_BEHAVIOR + +my_sid_attr[0].value = SAI_MY_SID_ENTRY_ENDPOINT_TYPE_DT46 + +my_sid_attr[1].id = SAI_MY_SID_ENTRY_ATTR_VRF + +my_sid_attr[1].value.oid = vr_id_1001 // overlay vrf, created elsewhere + +saistatus = saiv6sr_api->create_my_sid_entry(&my_sid_entry, 2, my_sid_attr) + + +## 3.5 YANG Model +``` +module: sonic-srv6 + +--rw sonic-srv6 + +--rw SRV6_SID_LIST + | +--rw SRV6_SID_LIST_LIST* [name] + | +--rw name string + | +--rw path* inet:ipv6-address + +--rw SRV6_MY_SID + | +--rw SRV6_MY_SID_LIST* [ip-address] + | +--rw ip-address inet:ipv6-address + | +--rw block_len? uint16 + | +--rw node_len? uint16 + | +--rw func_len? uint16 + | +--rw arg_len? uint16 + | +--rw action? enumeration + | +--rw vrf? 
-> /vrf:sonic-vrf/VRF/VRF_LIST/name + | +--rw adj* inet:ipv6-address + | +--rw policy? -> /sonic-srv6/SRV6_POLICY/SRV6_POLICY_LIST/name + | +--rw source? inet:ipv6-address + +--rw SRV6_POLICY + | +--rw SRV6_POLICY_LIST* [name] + | +--rw name string + | +--rw segment* -> /sonic-srv6/SRV6_SID_LIST/SRV6_SID_LIST_LIST/name + +--rw SRV6_STEER + +--rw SRV6_STEER_LIST* [vrf-name ip-prefix] + +--rw vrf-name -> /vrf:sonic-vrf/VRF/VRF_LIST/name + +--rw ip-prefix union + +--rw policy? -> /sonic-srv6/SRV6_POLICY/SRV6_POLICY_LIST/name + +--rw source? inet:ipv6-address +``` + +## 4 Unit Test + +TBD + +## 5 References + +- [SAI IPv6 Segment Routing Proposal for SAI 1.2.0](https://github.com/opencomputeproject/SAI/blob/1066c815ddd7b63cb9dbf4d76e06ee742bc0af9b/doc/SAI-Proposal-IPv6_Segment_Routing-1.md) + +- [RFC 8754](https://tools.ietf.org/html/rfc8754) +- [RFC 8986](https://www.rfc-editor.org/rfc/rfc8986.html) +- [draft-filsfils-spring-segment-routing-policy](https://tools.ietf.org/html/draft-filsfils-spring-segment-routing-policy-06) + +- [draft-ali-spring-bfd-sr-policy-06](https://tools.ietf.org/html/draft-ali-spring-bfd-sr-policy-06) + +- [draft-filsfils-spring-net-pgm-extension-srv6-usid](https://tools.ietf.org/html/draft-filsfils-spring-net-pgm-extension-srv6-usid-08) + +- [draft-cl-spring-generalized-srv6-for-cmpr](https://tools.ietf.org/html/draft-cl-spring-generalized-srv6-for-cmpr-02) + + diff --git a/doc/subport/sonic-sub-port-intf-hld.md b/doc/subport/sonic-sub-port-intf-hld.md index 5f33638552..1e0ad3f4bd 100644 --- a/doc/subport/sonic-sub-port-intf-hld.md +++ b/doc/subport/sonic-sub-port-intf-hld.md @@ -9,9 +9,10 @@ * [1 Requirements](#1-requirements) * [2 Schema design](#2-schema-design) * [2.1 Configuration](#21-configuration) - * [2.1.1 config_db.json](#211-config-db-json) - * [2.1.2 CONFIG_DB](#212-config-db) - * [2.1.3 CONFIG_DB schemas](#213-config-db-schemas) + * [2.1.1 Naming Convention for sub-interfaces](#211-naming-convention-for-sub-interfaces) + * 
[2.1.2 config_db.json](#211-config-db-json) + * [2.1.3 CONFIG_DB](#212-config-db) + * [2.1.4 CONFIG_DB schemas](#213-config-db-schemas) * [2.2 APPL_DB](#22-appl-db) * [2.3 STATE_DB](#23-state-db) * [2.4 SAI](#24-sai) @@ -27,6 +28,7 @@ * [3.1 Sub port interface creation](#31-sub-port-interface-creation) * [3.2 Sub port interface runtime admin status change](#32-sub-port-interface-runtime-admin-status-change) * [3.3 Sub port interface removal](#33-sub-port-interface-removal) + * [3.4 Sub port MTU Configuration](#34-sub-port-mtu-configuration) * [4 CLI](#4-cli) * [4.1 Config commands](#41-config-commands) * [4.1.1 Config a sub port interface](#411-config-a-sub-port-interface) @@ -44,19 +46,23 @@ * [6.3.2 Remove all IP addresses from a sub port interface](#632-remove-all-ip-addresses-from-a-sub-port-interface) * [6.3.3 Remove a sub port interface](#633-remove-a-sub-port-interface) * [7 Scalability](#7-scalability) - * [8 Port channel renaming](#8-port-channel-renaming) + * [8 upgrade and downgrade considerations](#8-upgrade-and-downgrade-considerations) * [9 Appendix](#9-appendix) * [9.1 Difference between a sub port interface and a vlan interface](#91-difference-between-a-sub-port-interface-and-a-vlan-interface) - * [10 Open questions](#10-open-questions) - * [11 Acknowledgment](#11-acknowledgment) - * [12 References](#12-references) + * [10 API Library](#10-api-library) + * [10.1 SWSS CPP Library](#101-swss-cpp-library) + * [10.2 Python Library](#102-python-library) + * [11 Open questions](#11-open-questions) + * [12 Acknowledgment](#12-acknowledgment) + * [13 References](#13-references) # Revision history -| Rev | Date | Author | Change Description | -|:---:|:-----------:|:------------------:|-----------------------------------| -| 0.1 | 07/01/2019 | Wenda Ni | Initial version | +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:------------------:|---------------------------------------------------------| +| 0.1 | 07/01/2019 | Wenda Ni | Initial 
version | +| 0.2 | 12/17/2020 | Broadcom | Subinterface naming convention changes and enhancements | # Scope A sub port interface is a logical interface that can be created on a physical port or a port channel. @@ -96,8 +102,14 @@ A sub port interface shall support the following features: * VRF * RIF counters * QoS setting inherited from parent physical port or port channel -* mtu inherited from parent physical port or port channel +* MTU: + MTU of the subinterface is inherited from the parent interface (physical or portchannel) + If subinterface MTU is configured, MTU on subinterface will be configured with: + - If Subinterface MTU <= parent port MTU, configured subinterface MTU will be applied. + - If Subinterface MTU > parent port MTU, parent port MTU will be applied. * Per sub port interface admin status config + - Kernel subinterface netdev admin UP can be performed only if parent interface netdev is admin UP. + Hence subinterface admin UP is performed only after parent interface is admin UP. # 2 Schema design @@ -105,19 +117,48 @@ We introduce a new table "VLAN_SUB_INTERFACE" in the CONFIG_DB to host the attri For APPL_DB and STATE_DB, we do not introduce new tables for sub port interfaces, but reuse existing tables to host sub port interface keys. ## 2.1 Configuration -### 2.1.1 config_db.json +### 2.1.1 Naming Convention for sub-interfaces: + +Since Kernel has netdevice name length restriction to 15, Physical sub-interfaces(in case interface number > 99) and port channel sub-interfaces cannot follow the same nomenclature as physical interfaces. +Hence short name convention needs to be supported for subinterfaces. + +All DB & kernel netdevice corresponding to the sub-interface will be created based on user configuration. +- If user configures subinterfaces in short name format, all DB & kernel netdevices will be created in short name format. 
+- If user configures subinterfaces in existing long name format, all DB & netdevices will be created with existing long name format. + +Short naming conventions for sub-interfaces will have Ethxxx.yyyy, Poxxx.yyyy format. +Long naming conventions for sub-interfaces will have Ethernetxx.yyyy. +Physical subinterfaces on interface number exceeding 2 digits and PortChannel subinterfaces in long name format were not supported earlier and will NOT be supported due to name length restriction. + +Intfmgrd & IntfsOrch which manages sub-interfaces should be aware of this mapping to get parent interface properties. + +SWSS CPP library & Click Python API library will be provided to perform short name to long name conversion and vice versa. +Please refer to the API library section for details. + +All click config CLIs for sub-interfaces will be enhanced to accept both long name & short name format for subinterfaces. + +### 2.1.2 config_db.json ``` "VLAN_SUB_INTERFACE": { - "{{ port_name }}.{{ vlan_id }}": { - "admin_status" : "{{ adminstatus }}" + "{{ port_name }}.{{ subinterface_id }}": { + "vlan" : <1-4094>, + "admin_status" : "{{ adminstatus }}", + "vrf_name" : }, - "{{ port_name }}.{{ vlan_id }}|{{ ip_prefix }}": {} + "{{ port_name }}.{{ subinterface_id }}|{{ ip_prefix }}": {} }, ``` A key in the VLAN_SUB_INTERFACE table is the name of a sub port, which consists of two sections delimited by a "." (symbol dot). -The section before the dot is the name of the parent physical port or port channel. The section after the dot is the dot1q encapsulation vlan id. +The section before the dot is the name of the parent physical port or port channel. The section after the dot is a unique number which uniquely identifies the sub-interface on the parent interface. +Sub-interface id value represents vlan id in long name format. +Sub-interface id value in short name format uniqeuly identifies subinterface under the parent interface. 
It can be in range 1-99999999(Subinterface ID cannot exceed 8 digits). + +vlan field is applicable only for short name format subinterfaces. +vlan field identifies the vlan to which the sub-interface is associated using .1Q trunking. +Note that subinterface_id and vlan_id for a subinterface can be different in short name format. -mtu of a sub port interface is inherited from its parent physical port or port channel, and is not configurable in the current design. +In Click CLI, user will be able to configure the vlan id associated with the sub-interface in short name format. +In existing long name format Sub-interface id is used as vlan id. admin_status of a sub port interface can be either up or down. In the case field "admin_status" is absent in the config_db.json file, a sub port interface is set admin status up by default at its creation. @@ -125,34 +166,48 @@ In the case field "admin_status" is absent in the config_db.json file, a sub por Example configuration: ``` "VLAN_SUB_INTERFACE": { - "Ethernet64.10": { + "Ethernet0.100": { "admin_status" : "up" }, - "Ethernet64.10|192.168.0.1/21": {}, - "Ethernet64.10|fc00::/7": {} + "Ethernet0.100|192.0.0.1/21": {}, + "Ethernet0.100|fc0a::/112": {} + "Eth64.10": { + “vlan” : 100, + "admin_status" : "up" + }, + "Eth64.10|192.168.0.1/21": {}, + "Eth64.10|fc00::/7": {} }, ``` -### 2.1.2 CONFIG_DB +### 2.1.3 CONFIG_DB ``` -VLAN_SUB_INTERFACE|{{ port_name }}.{{ vlan_id }} +VLAN_SUB_INTERFACE|{{ port_name }}.{{ subinterface_id }} + "vlan" : "{{ vlan-id }}" "admin_status" : "{{ adminstatus }}" -VLAN_SUB_INTERFACE|{{ port_name }}.{{ vlan_id }}|{{ ip_prefix }} +VLAN_SUB_INTERFACE|{{ port_name }}.{{ subinterface_id }}|{{ ip_prefix }} "NULL" : "NULL" ``` -### 2.1.3 CONFIG_DB schemas +### 2.1.4 CONFIG_DB schemas ``` ; Defines for sub port interface configuration attributes key = VLAN_SUB_INTERFACE|subif_name ; subif_name is the name of the sub port interface ; subif_name annotations -subif_name = port_name "." 
vlan_id ; port_name is the name of parent physical port or port channel - ; vlanid is DIGIT 1-4094 +subif_name = port_name "." subinterface_id ; port_name is the name of parent physical port or port channel + ; In short name format subinterface_id is DIGIT 1-99999999 + ; In long name format subinterface_id is vlan id. ; field = value admin_status = up / down ; admin status of the sub port interface + +; field = value +vlan = <1-4094> ; Vlan id in range <1-4094> + +; field = value +vrf_name = ; Name of the Vrf ``` ``` @@ -183,24 +238,40 @@ ls32 = ( h16 ":" h16 ) / IPv4address Example: ``` -VLAN_SUB_INTERFACE|Ethernet64.10 +VLAN_SUB_INTERFACE|Ethernet0.100 "admin_status" : "up" -VLAN_SUB_INTERFACE|Ethernet64.10|192.168.0.1/21 +VLAN_SUB_INTERFACE|Ethernet0.100|192.0.0.1/21 "NULL" : "NULL" -VLAN_SUB_INTERFACE|Ethernet64.10|fc00::/7 +VLAN_SUB_INTERFACE|Ethernet0.100|fc0a::/112 + "NULL" : "NULL" + +VLAN_SUB_INTERFACE|Eth64.10 + "vlan" : 100, + "admin_status" : "up" + +VLAN_SUB_INTERFACE|Eth64.10|192.168.0.1/21 + "NULL" : "NULL" + +VLAN_SUB_INTERFACE|Eth64.10|fc00::/7 "NULL" : "NULL" ``` ## 2.2 APPL_DB ``` -INTF_TABLE:{{ port_name }}.{{ vlan_id }} +INTF_TABLE:{{ port_name }}.{{ subinterface_id }} + "vlan" : "{{ vlan id }}" "admin_status" : "{{ adminstatus }}" ; field = value admin_status = up / down ; admin status of the sub port interface +; field = value +vlan = <1-4094> ; Vlan id in range <1-4094> + +; field = value +vrf_name = ; Name of the Vrf INTF_TABLE:{{ port_name }}.{{ vlan_id }}:{{ ip_prefix }} "scope" : "{{ visibility_scope }}" @@ -213,14 +284,26 @@ family = IPv4 / IPv6 ; address family Example: ``` -INTF_TABLE:Ethernet64.10 +INTF_TABLE:Ethernet0.100 + "admin_status" : "up" + +INTF_TABLE:Ethernet0.100:192.0.0.1/24 + "scope" : "global" + "family": "IPv4" + +INTF_TABLE:Ethernet0.100:fc0a::/112 + "scope" : "global" + "family": "IPv6" + +INTF_TABLE:Eth64.10 + "vlan" : 100 "admin_status" : "up" -INTF_TABLE:Ethernet64.10:192.168.0.1/24 
+INTF_TABLE:Eth64.10:192.168.0.1/24 "scope" : "global" "family": "IPv4" -INTF_TABLE:Ethernet64.10:fc00::/7 +INTF_TABLE:Eth64.10:fc00::/7 "scope" : "global" "family": "IPv6" ``` @@ -229,29 +312,42 @@ INTF_TABLE:Ethernet64.10:fc00::/7 Following the current schema, sub port interface state of a physical port is set to the PORT_TABLE, while sub port interface state of a port channel is set to the LAG_TABLE. ``` -PORT_TABLE|{{ port_name }}.{{ vlan_id }} +PORT_TABLE|{{ port_name }}.{{ subinterface_id }} "state" : "ok" ``` ``` -LAG_TABLE|{{ port_name }}.{{ vlan_id }} +LAG_TABLE|{{ port_name }}.{{ subinterface_id }} "state" : "ok" ``` ``` -INTERFACE_TABLE|{{ port_name }}.{{ vlan_id }}|{{ ip_prefix }} +INTERFACE_TABLE|{{ port_name }}.{{ subinterface_id }}|{{ ip_prefix }} "state" : "ok" ``` Example: ``` -PORT_TABLE|Ethernet64.10 +PORT_TABLE|Ethernet0.100 + "state" : "ok" +``` +``` +INTERFACE_TABLE|Ethernet0.100|192.0.0.1/21 + "state" : "ok" +``` +``` +INTERFACE_TABLE|Ethernet0.100|fc0a::/112 + "state" : "ok" +``` + +``` +PORT_TABLE|Eth64.10 "state" : "ok" ``` ``` -INTERFACE_TABLE|Ethernet64.10|192.168.0.1/21 +INTERFACE_TABLE|Eth64.10|192.168.0.1/21 "state" : "ok" ``` ``` -INTERFACE_TABLE|Ethernet64.10|fc00::/7 +INTERFACE_TABLE|Eth64.10|fc00::/7 "state" : "ok" ``` @@ -335,23 +431,28 @@ sai_status_t status = remove_router_interface(rif_id); Inside SONiC, we use iproute2 package to manage host sub port interfaces. Specifically, we use `ip link add link name type vlan id ` to create a host sub port interface. -This command implies the dependancy that a parent host interface must be created before the creation of a host sub port interface. +This command implies the dependency that a parent host interface must be created before the creation of a host sub port interface. 
Example: ``` -ip link add link Ethernet64 name Ethernet64.10 type vlan id 10 -ip link set Ethernet64.10 mtu 9100 -ip link set Ethernet64.10 up +ip link add link Ethernet0 name Ethernet0.100 type vlan id 100 +ip link set Ethernet0.100 mtu 9100 +ip link set Ethernet0.100 up +ip link add link Ethernet64 name Eth64.10 type vlan id 100 +ip link set Eth64.10 mtu 9100 +ip link set Eth64.10 up ``` ``` -ip link del Ethernet64.10 +ip link del Ethernet0.100 +ip link del Eth64.10 ``` We use `ip address` and `ip -6 address` to add and remove ip adresses on a host sub port interface. Example: ``` -ip address add 192.168.0.1/24 dev Ethernet64.10 +ip address add 192.0.0.1/24 dev Ethernet0.100 +ip address add 192.168.0.1/24 dev Eth64.10 ``` Please note that the use of iproute2 package is internal to SONiC, specifically IntfMgrd. @@ -369,14 +470,42 @@ Internally, a sub port interface is represented as a Port object to be perceived # 3 Event flow diagrams ## 3.1 Sub port interface creation -![](sub_intf_creation_flow.png) +![](sub_intf_creation_flow_version_2.png) + +* Field vlan added to config_db carries vlan id associated to the subinterface. +* Sub Interface will be created and treated ready only if vlan corresponding to subinterface is configured. ## 3.2 Sub port interface runtime admin status change -![](sub_intf_set_admin_status_flow.png) +![](sub_intf_set_admin_status_flow_version_2.png) + +Admin status of the subinterface is tied to its parent interface admin status: +* Kernel does not allow subinterface netdev UP until its parent netdev is UP. +* IntfMgrd looks up the admin status of parent interface from STATE_DB|PORT_TABLE. + - OP: admin UP of subinterface: If Parent interface is admin UP, subinterface admin UP is performed. + - OP: admin down of subinterface: No dependency on parent interface admin status. Subinterface admin down performed. 
+* IntfMgrd also subscribes to STATE_DB|PORT_TABLE and APPL_DB|LAG_TABLE for parent interface admin status change to update associated subinterface admin status. ## 3.3 Sub port interface removal ![](sub_intf_removal_flow.png) +## 3.4 Sub port MTU Configuration +![](sub_intf_set_mtu_flow_version_2.png) + +MTU on subinterface has dependency on MTU configured on parent interface. + +* Kernel does not allow subinterface netdev MTU to exceed its parent netdev MTU. +* By default kernel inherits subinterface netdev MTU from parent netdev. +* If Parent netdev MTU is updated to lower value than any of its subinterface netdev MTU, kernel updates subinterface netdev MTU to parent netdev MTU. But, kernel does NOT restore previous subinterface MTU if parent netdev MTU is configured > subinterface MTU. + +To solve above dependency: + +* Whenever MTU is updated on subinterface + - If configured MTU <= Parent MTU, update subinterface MTU. + - If configured MTU > Parent interface MTU, do not update subinterface MTU and cache the configured MTU. +* IntfMgrd subscribes to STATE_DB|PORT_TABLE & APPL_DB|LAG_TABLE. + - If Parent interface MTU is changed to < subinterface MTU, APPL_DB|INTF_TABLE for subinterface is updated to parent interface MTU. + - If Parent interface MTU is changed to > subinterface MTU, update subinterface MTU to user configured subinterface MTU. 
+ # 4 CLIs ## 4.1 Config commands ### 4.1.1 Config a sub port interface @@ -409,7 +538,7 @@ Commands: del Remove a sub port interface ``` ``` -Usage: config subinterface add +Usage: config subinterface add [vlan <1-4094>] ``` ``` Usage: config subinterface del @@ -477,7 +606,8 @@ Example: ``` Sub port interface Speed MTU Vlan Admin Type ------------------ ------- ----- ------ ------- ------------------- - Ethernet64.10 100G 9100 10 up dot1q-encapsulation + Eth64.10 100G 9100 100 up dot1q-encapsulation + Ethernet0.100 100G 9100 100 up dot1q-encapsulation ``` No operational status is defined on RIF (sub port interface being a type of RIF) in SAI spec. @@ -551,10 +681,8 @@ We enforce a minimum scalability requirement on the number of sub port interface | Number of sub port interfaces per physical port or port channel | 250 | | Number of sub port interfaces per switch | 750 | -# 8 Port channel renaming -Linux has the limitation of 15 characters on an interface name. -For sub port interface use cases on port channels, we need to redesign the current naming convention for port channels (PortChannelXXXX, 15 characters) to take shorter names (such as, PoXXXX, 6 characters). -Even when the parent port is a physical port, sub port interface use cases, such as Ethernet128.1024, still exceed the 15-character limit on an interface name. +# 8 Upgrade and Downgrade considerations +Since subinterface are supported in existing long name CONFIG_DB format, Upgrade and downgrade will be seamless with no impact to subinterface functionality. # 9 Appendix ## 9.1 Difference between a sub port interface and a vlan interface @@ -564,7 +692,48 @@ Vlan interface is a router interface (RIF type vlan Vlan#) facing a .1Q bridge. ![](vlan_intf_rif.png "Fig. 3: Vlan interface") __Fig. 3: Vlan interface__ -# 10 Open questions: +# 10 API Library +All DB & Kernel netdev corresponding to the subinterface can be created with short name & existing long name format. 
+Intfmgrd & IntfsOrch which manages sub-interfaces should be able to fetch parent interface properties for a given subinterface. + +## 10.1 SWSS CPP Library +In CPP, applications can use subintf class provided by sonic-swss library to fetch attributes of subinterface. + +Subintf class provides below methods: + +1. isValid() +This method returns true if the subinterface is valid. +Subinterface will be considered valid if it follows format Ethxxx.yyyy, Poxxx.yyyy & Ethernetxx.yyyy. + +2. subIntfIdx() +Returns a subinterface index as an integer type. + +3. longName() +Returns subinterface name in longname format. + +4. shortName() +Returns subinterface name in shortname format. + +5. parentIntfLongName() +Returns parent interface name in longname format. + +6. parentIntfShortName() +Returns parent interface in shortname format. + + +## 10.2 Python Library +In Python, applications can use interface library in utilities_common to perform conversion to longname or shortname. + +1. intf_get_longname() +Returns interface in longname format. +It returns a longname format for both subinterface and parent interface depending on what argument is being passed. + +2. intf_get_shortname() +Returns interface in shortname format. +It returns a shortname format for both subinterface and parent interface depending on what argument is being passed. + + +# 11 Open questions: 1. Miss policy to be defined in SAI specification When a 802.1q tagged packet is received on a physical port or a port channel, it will go to the sub port interface that matches the VLAN id inside the packet. @@ -573,10 +742,10 @@ __Fig. 3: Vlan interface__ As shown in Fig. 1, there is possiblity that a physical port or a port channel may not have a RIF type port created. In this case, if an untagged packet is received on the physical port or port channel, what is the policy on handling the untagged packet? 
-# 11 Acknowledgment +# 12 Acknowledgment Wenda would like to thank his colleagues with Microsoft SONiC team, Shuotian, Prince, Pavel, and Qi in particular, Itai with Mellanox for all discussions that shape the design proposal, and community members for comments and feedbacks that improve the design. -# 12 References +# 13 References [1] SAI_Proposal_Bridge_port_v0.9.docx https://github.com/opencomputeproject/SAI/blob/master/doc/bridge/SAI_Proposal_Bridge_port_v0.9.docx [2] Remove the need to create an object id for vlan in creating a sub port router interface https://github.com/opencomputeproject/SAI/pull/998 diff --git a/doc/subport/sub_intf_creation_flow_version_2.png b/doc/subport/sub_intf_creation_flow_version_2.png new file mode 100644 index 0000000000..8b9f252379 Binary files /dev/null and b/doc/subport/sub_intf_creation_flow_version_2.png differ diff --git a/doc/subport/sub_intf_set_admin_status_flow_version_2.png b/doc/subport/sub_intf_set_admin_status_flow_version_2.png new file mode 100644 index 0000000000..989d284608 Binary files /dev/null and b/doc/subport/sub_intf_set_admin_status_flow_version_2.png differ diff --git a/doc/subport/sub_intf_set_mtu_flow_version_2.png b/doc/subport/sub_intf_set_mtu_flow_version_2.png new file mode 100644 index 0000000000..d9e79f01b4 Binary files /dev/null and b/doc/subport/sub_intf_set_mtu_flow_version_2.png differ diff --git a/doc/system_health_monitoring/system-health-HLD.md b/doc/system_health_monitoring/system-health-HLD.md index 9b534af838..25b11be2f6 100644 --- a/doc/system_health_monitoring/system-health-HLD.md +++ b/doc/system_health_monitoring/system-health-HLD.md @@ -5,69 +5,64 @@ | Rev | Date | Author | Change Description | |:---:|:-----------:|:------------------:|-----------------------------------| | 0.1 | | Kebo Liu | Initial version | - + | 0.2 | | Junchao Chen | Check service status without monit| ## 1. 
Overview of the system health monitor -System health monitor is intended to monitor both critical services and peripheral device status and leverage system log, system status LED to and CLI command output to indicate the system status. - -In current SONiC implementation, already have Monit which is monitoring the critical services status and also have a set of daemons(psud, thermaltcld, etc.) inside PMON collecting the peripheral devices status. - -System health monitoring service will not monitor the critical services or devices directly, it will reuse the result of Monit and PMON daemons to summary the current status and decide the color of the system health LED. - -### 1.1 Services under Monit monitoring - -For the Monit, now below services and file system is under monitoring: - - admin@sonic# monit summary -B - Monit 5.20.0 uptime: 1h 6m - Service Name Status Type - sonic Running System - rsyslog Running Process - telemetry Running Process - dialout_client Running Process - syncd Running Process - orchagent Running Process - portsyncd Running Process - neighsyncd Running Process - vrfmgrd Running Process - vlanmgrd Running Process - intfmgrd Running Process - portmgrd Running Process - buffermgrd Running Process - nbrmgrd Running Process - vxlanmgrd Running Process - snmpd Running Process - snmp_subagent Running Process - sflowmgrd Running Process - lldpd_monitor Running Process - lldp_syncd Running Process - lldpmgrd Running Process - redis_server Running Process - zebra Running Process - fpmsyncd Running Process - bgpd Running Process - staticd Running Process - bgpcfgd Running Process - root-overlay Accessible Filesystem - var-log Accessible Filesystem - - -By default any above services or file systems is not in good status will be considered as fault condition. 
- -### 1.2 Peripheral devices status which could impact the system health status +System health monitor is intended to monitor both critical services/processes and peripheral device status and leverage system log, system status LED, and CLI command output to indicate the system status. + +In current SONiC implementation, Monit service can monitor the file system as well as customized script status; system health monitor can rely on Monit service to monitor these items. There are also a set of daemons such as psud, thermaltcld inside PMON to collect the peripheral devices status. + +System health monitor needs to monitor the critical service/processes status and borrow the result of Monit service/PMON daemons to summarize the current status and decide the color of the system health LED. + +### 1.1 Monitor critical services/processes + +#### 1.1.1 Monitor critical services + +1. Read FEATURE table in CONFIG_DB, any service whose "STATE" field was configured with "enabled" or "always_enabled" is expected to run in the system +2. Get running services via docker tool (Use python docker library to get running containers) +3. Compare result of #1 and result of #2, any difference will be considered as fault condition + +#### 1.1.2 Monitor critical processes + +1. Read FEATURE table in CONFIG_DB, any service whose "STATE" field was configured with "enabled" or "always_enabled" is expected to run in the system +2. Get critical processes of each running service by reading file /etc/supervisor/critical_processes (Use `docker inspect --format "{{.GraphDriver.Data.MergedDir}}"` to get base directory for a container) +3. For each container, use "supervisorctl status" to get its critical process status, any critical process that is not in "RUNNING" status will be considered as fault condition. 
+ +### 1.2 Services under Monit monitoring + +For the Monit, now below programs and file systems are under monitoring: + +``` +admin@sonic:~$ sudo monit summary -B +Monit 5.20.0 uptime: 22h 56m + Service Name Status Type + sonic Running System + rsyslog Running Process + root-overlay Accessible Filesystem + var-log Accessible Filesystem + routeCheck Status ok Program + diskCheck Status ok Program + container_checker Status ok Program + vnetRouteCheck Status ok Program + container_memory_telemetry Status ok Program +``` + +By default any service is not in expected status will be considered as fault condition. + +### 1.3 Peripheral devices status which could impact the system health status - Any fan is missing/broken -- Fan speed is below minimal range +- Fan speed is lower than minimal value - PSU power voltage is out of range -- PSU temperature is too hot +- PSU temperature is higher than threshold - PSU is in bad status -- ASIC temperature is too hot +- ASIC temperature is higher than threshold -### 1.3 Customization of monitored critical services and devices +### 1.4 Customization of monitored critical services and devices -#### 1.3.1 Ignore some of monitored critical services and devices +#### 1.4.1 Ignore some of monitored critical services and devices The list of monitored critical services and devices can be customized by a configuration file, the user can rule out some services or device sensors status from the monitor list. System health monitor will load this configuration file at next run and ignore the services or devices during the routine check. ```json { @@ -91,12 +86,12 @@ The filter string is case sensitive. Currently, it support following filters: - .temperature: ignore temperature check for a specific PSU - .voltage: ignore voltage check for a specific PSU -The default filter is to filter nothing. Unknown filters will be silently ignored. 
The "serivces_to_ignore" and "devices_to_ignore" section must be an string array or it will use default filter. +The default filter is to filter nothing. Unknown filters will be silently ignored. The "services_to_ignore" and "devices_to_ignore" section must be an string array or it will use default filter. This configuration file will be platform specific and shall be added to the platform folder(/usr/share/sonic/device/{platform_name}/system_health_monitoring_config.json). -#### 1.3.2 Extend the monitoring with adding user specific program to Monit -Monit support to check program(scripts) exit status, if user want to monitor something that beyond critical serives or some special device not included in the above list, they can provide a specific scripts and add it to Monit check list, then the result can also be collected by the system health monitor. It requires 2 steps to add an external checker. +#### 1.4.2 Extend the monitoring with adding user specific program to monitor +Monit supports to check program(scripts) exit status, if user wants to monitor something that beyond critical services or some special device not included in the above list, they can provide specific scripts and add them to Monit checking list. Then the result can also be collected by the system health monitor. It requires two steps to add an external checker. 1. Prepare program whose command line output must qualify: @@ -130,9 +125,9 @@ The configuration shall be: } ``` -### 1.4 system status LED color definition +### 1.5 system status LED color definition -default system status LED color definition is like +default system status LED color definition is like | Color | Status | Description | |:----------------:|:-------------:|:-----------------------:| @@ -153,27 +148,30 @@ Considering that different vendors platform may have different LED color capabil } ``` +The field "booting" is deprecated because there is no booting stage anymore. 
For backward compatibility, user can still configure this field but it won't take effect. + +## 2. System health monitor service business logic -System health monitor daemon will running on the host, periodically(every 60s) check the "monit summary" command output and PSU, fan, thermal status which stored in the state DB, if anything wrong with the services monitored by monit or peripheral devices, system status LED will be set to fault status. When fault condition relieved, system status will be set to normal status. +System health monitor daemon will run on the host, and periodically (every 60 seconds) check critical services, processes status, output of the command "monit summary", PSU, Fan, and thermal status which is stored in the state DB. If anything is abnormal, system status LED will be set to fault status. When fault condition relieved, system status will be set to normal status. -Before the switch boot up finish, the system health monitoring service shall be able to know the switch is in boot up status(see open question 1). +System health service shall start after database.service and updategraph.service. Monit service has a default 300 seconds start delay; system health service shall not wait for Monit service as Monit service only monitors part of the system. But system health service shall treat system as "Not OK" until Monit service starts to work. -If monit service is not avalaible, will consider system in fault condition. -FAN/PSU/ASIC data not available will also considered as fault conditon. +Empty FEATURE table will be considered as fault condition. +A service whose critical_processes file cannot be parsed will be considered as fault condition. An empty or absent critical_processes file is not a fault condition and shall be skipped. +If Monit service is not running or in dead state, the system will be considered in fault condition. +If FAN/PSU/ASIC data is not available, this will be considered as fault condition. 
Incomplete data in the DB will also be considered as fault condition, e.g., PSU voltage data is there but threshold data not available. Monit, thermalctld and psud will raise syslog when fault condition encountered, so system health monitor will only generate some general syslog on these situation to avoid redundant. For example, when fault condition meet, "system health status change to fault" can be print out, "system health status change to normal" when it recovered. -this service will be started after system boot up(after database.service and updategraph.service). ## 3. System health data in redis database System health service will populate system health data to STATE db. A new table "SYSTEM_HEALTH_INFO" will be created to STATE db. ; Defines information for a system health - key = SYSTEM_HEALTH_INFO ; health information for the switch + key = SYSTEM_HEALTH_INFO ; health information for the switch ; field = value summary = STRING ; summary status for the switch = STRING ; an entry for a service or device @@ -244,7 +242,7 @@ Add a new "show system-health" command line to the system system-health Show system health status ... -"show system-health" CLI has three sub command, "summary" and "detail" and "monitor-list". With command "summary" will give brief outpt of system health status while "detail" will be more verbose. +"show system-health" CLI has three sub command, "summary" and "detail" and "monitor-list". With command "summary" will give brief output of system health status while "detail" will be more verbose. "monitor-list" command will list all the services and devices under monitoring. admin@sonic# show system-health ? @@ -281,7 +279,7 @@ When something is wrong for the "detail" sub command output, it will give out all the services and devices status which is under monitoring, and also the ignored service/device list will also be displayed. -"moniter-list" will give a name list of services and devices exclude the ones in the ignore list. 
+"monitor-list" will give a name list of services and devices exclude the ones in the ignore list. When the CLI been called, it will directly analyze the "monit summary" output and the state DB entries to present a summary about the system health status. The status analyze logic of the CLI shall be aligned/shared with the logic in the system health service. @@ -300,20 +298,8 @@ Fault condition and CLI output string table | FAN data is not available in the DB|FAN data is not available| | ASIC data is not available in the DB|ASIC data is not available| -See open question 2 for adding configuration CLIs. - ## 6. System health monitor test plan 1. If some critical service missed, check the CLI output, the LED color and error shall be as expected. 2. Simulate PSU/FAN/ASIC and related sensor failure via mock sysfs and check the CLI output, the LED color and error shall be as expected. -3. Change the monitor service/device list then check whether the system health monitor service works as expected; also check whether the result of "show system-health monitor-list" aligned. - -## 7. Open Questions - -1. How to determine the SONiC system is in boot up stage? The current design is to compare the system up time with a "boot_timeout" value. The system up time is got from "cat /proc/uptime". The default "boot_timeout" is 300 seconds and can be configured by configuration. System health service will not do any check until SONiC system finish booting. - -```json -{ - "boot_timeout": 300 -} -``` +3. Change the monitor service/device list then check whether the system health monitor service works as expected; also check whether the result of "show system-health monitor-list" aligned. 
diff --git a/doc/vxlan/EVPN/EVPN_VXLAN_HLD.md b/doc/vxlan/EVPN/EVPN_VXLAN_HLD.md index 64db15f155..3074b03a28 100644 --- a/doc/vxlan/EVPN/EVPN_VXLAN_HLD.md +++ b/doc/vxlan/EVPN/EVPN_VXLAN_HLD.md @@ -2,7 +2,7 @@ # EVPN VXLAN HLD -#### Rev 0.9 +#### Rev 1.0 # Table of Contents @@ -28,7 +28,11 @@ - [COUNTER_DB](#counter_db-changes) - [4.3 Modules Design and Flows](#43-modules-design-and-flows) - [4.3.1 Tunnel Creation](#431-tunnel-auto-discovery-and-creation) + - [4.3.1.1 P2P Tunnel Creation](#4311-p2p-tunnel-creation) + - [4.3.1.2 P2MP Tunnel Creation](#4312-p2mp-tunnel-creation) - [4.3.2 Tunnel Deletion](#432-tunnel-deletion) + - [4.3.2.1 P2P Tunnel Deletion](#4321-p2p-tunnel-deletion) + - [4.3.2.2 P2MP Tunnel Deletion](#4322-p2mp-tunnel-deletion) - [4.3.3 Mapper Handling](#433-per-tunnel-mapper-handling) - [4.3.4 VXLAN State DB Changes](#434-vxlan-state-db-changes) - [4.3.5 Tunnel ECMP](#435-support-for-tunnel-ecmp) @@ -69,6 +73,7 @@ | 0.7 | | Rajesh Sankaran | Click and SONiC CLI added | | 0.8 | | Hasan Naqvi | Linux kernel section and fdbsyncd testcases added | | 0.9 | | Nikhil Kelhapure | Warm Reboot Section added | +| 1.0 | | Sudharsan D.G | Using P2MP Tunnel for Layer2 functionality | # Definition/Abbreviation @@ -87,7 +92,8 @@ | VRF | Virtual Routing and Forwarding | | VTEP | VXLAN Tunnel End point | | VXLAN | Virtual Extended LAN | - +| P2P | Point to Point Tunnel | +| P2MP | Point to MultiPoint Tunnel | # About this Manual This document provides general information about the EVPN VXLAN feature implementation based on RFC 7432 and 8365 in SONiC. @@ -623,6 +629,9 @@ In the current implementation, Tunnel Creation handling in the VxlanMgr and Vxla The VTEP is represented by a VxlanTunnel Object created as above with the DIP as 0.0.0.0 and SAI object type as TUNNEL. This SAI object is P2MP. +Some vendors support P2P Tunnels to handle Layer2 extension and fdb learning while some vendors support using existing P2MP for handling Layer2 scenarios. 
The difference between the two approaches is the way in which the remote end point flooding is done. In P2P tunnel based approach, for every end point discovered from IMET a P2P tunnel object is created in the hardware and the bridge port created with this tunnel object is added as a VLAN member to the VLAN. In P2MP tunnel based approach, when an IMET route is received the remote end point along with local P2MP tunnel bridge port is added as L2MC group member along for the L2MC group associated with the VLAN. In order to handle both scenarios, evpn_remote_vni orch which currently handles remote VNI is split into two types - evpn_remote_vni_p2p to handle the flow involving the P2P tunnel creation and evpn_remote_vni_p2mp to handle the flow for using the existing P2MP tunnel. The decision to chose which orch to use is dependent on the SAI enum query capability for the attribute SAI_TUNNEL_ATTR_PEER_MODE. If the vendors have SAI_TUNNEL_PEER_MODE_P2P listed, then evpn_remote_vni_p2p orch will be used, else evpn_remote_vni_p2mp will be used. These enhancements abstract the two different modes that can be used to program the SAI. For an external user, there will be no changes from usability perspective since the schema is unchanged. + +#### 4.3.1.1 P2P Tunnel creation In this feature enhancement, the following events result in remote VTEP discovery and trigger tunnel creation. These tunnels are referred to as dynamic tunnels and are P2P. - IMET route rx @@ -643,10 +652,15 @@ For every dynamic tunnel discovered, the following processing occurs. The creation sequence assuming only IMET rx is depicted in the diagram below. ![Tunnel Creation](images/tunnelcreate.PNG "Figure : Tunnel Creation") -__Figure 5: EVPN Tunnel Creation__ +__Figure 5.1: EVPN P2P Tunnel Creation__ -### 4.3.2 Tunnel Deletion +#### 4.3.1.2 P2MP Tunnel Creation +In the current implementation P2MP tunnel creation flow exist with the exception of a bridgeport not created for P2MP tunnel. 
To support using P2MP tunnel for L2 purposes a bridge port is created for the P2MP tunnel object. +![P2MP Tunnel Creation](images/p2mptunnelcreate.jpg "Figure : P2MP Tunnel Creation") +__Figure 5.2: EVPN P2MP Tunnel Creation__ +### 4.3.2 Tunnel Deletion +#### 4.3.2.1 P2P Tunnel Deletion EVPN Tunnel Deletion happens when the refcnt goes down to zero. So depending on the last route being deleted (IMET, MAC or IP prefix) the tunnel is deleted. sai_tunnel_api remove calls are incompletely handled in the current implementation. @@ -656,6 +670,9 @@ The following will be added as part of tunnel deletion. - sai_tunnel_remove_map, sai_tunnel_remove_tunnel_termination, sai_tunnel_remove_tunnel when the tunnel is to be removed on account of the last entry being removed. - VxlanTunnel object will be deleted. +#### 4.3.2.2 P2MP Tunnel Deletion +In case of P2MP tunnels, the flow is same as the existing flow where the tunnel is deleted after last vxlan-vni map or vrf-vni map is deleted. Additionally before the tunnel deletion, the bridge port created is deleted. + ### 4.3.3 Per Tunnel Mapper handling The SAI Tunnel interface requires encap and decap mapper id to be specified along with every sai tunnel create call. @@ -698,6 +715,7 @@ It is proposed to handle these variances in the SAI implementation. ### 4.3.6 IMET route handling +#### 4.3.6.1 P2P Tunnel Vlan extension The IMET route is used in EVPN to specify how BUM traffic is to be handled. This feature enhancement supports only ingress replication as the method to originate BUM traffic. The VLAN, Remote IP and VNI to be used is encoded in the IMET route. @@ -707,7 +725,15 @@ The VLAN, Remote IP and VNI to be used is encoded in the IMET route. The IMET rx processing sequence is depicted in the diagram below. 
![Vlan extension](images/vlanextend.PNG "Figure : VLAN Extension") -__Figure 6: IMET route processing VLAN extension__ +__Figure 6.1: IMET route processing P2P Tunnel VLAN extension__ + +#### 4.3.6.2 P2MP Tunnel Vlan extension + +Similar to P2P tunnel scenario, the feature supports only the ingress replication. However the remote end points are added to VLAN as follows. In SONiC VLAN is created currently using SAI_VLAN_FLOOD_CONTROL_TYPE_ALL(default). To support flooding in P2MP based tunnels, the VLAN's flood control type is set to SAI_VLAN_FLOOD_CONTROL_TYPE_COMBINED which would support flooding to local ports as well as an additional multicast group. When type 2 prefixs are received, the remote end points are added to VLAN by creating a L2MC group and setting it to VLAN created in combined mode, and adding one L2MC group member per remote end point as shown in the flow below + +![P2MP Vlan extension](images/p2mpvlanextension.jpg "Figure : P2MP VLAN Extension") +__Figure 6.2: IMET route processing P2MP TunnelVLAN extension__ + ##### FRR processing When remote IMET route is received, fdbsyncd will install entry in REMOTE_VNI_TABLE in APP_DB: @@ -1078,10 +1104,20 @@ Linux kernel version 4.9.x used in SONiC requires backport of a few patches to s | Vrf-1 | 104 | +-------+-------+ Total count : 1 - + 4. 
show vxlan tunnel + +-----------------------+---------------+------------------+------------------+---------------------------------+ + | vxlan tunnel name | source ip | destination ip | tunnel map name | tunnel map mapping(vni -> vlan) | + +=======================+===============+==================+==================+=================================+ + | Vtep1 | 4.4.4.4 | | map_50_Vlan5 | 50 -> 5 | + +-----------------------+---------------+------------------+------------------+---------------------------------+ + | Vtep1 | 4.4.4.4 | | map_100_Vlan10 | 100 -> 10 | + +-----------------------+---------------+------------------+------------------+---------------------------------+ + +5. show vxlan remotevtep - lists all the discovered tunnels. - SIP, DIP, Creation Source, OperStatus are the columns. + - Since P2P tunnels are not created in the hardware on the flow where P2MP tunnel itself is used flooding using L2MC group, this table will not be populated. +---------+---------+-------------------+--------------+ | SIP | DIP | Creation Source | OperStatus | @@ -1092,7 +1128,7 @@ Linux kernel version 4.9.x used in SONiC requires backport of a few patches to s +---------+---------+-------------------+--------------+ Total count : 2 -5. show vxlan remote_mac +6. show vxlan remote_mac - lists all the MACs learnt from the specified remote ip or all the remotes for all vlans. (APP DB view) - VLAN, MAC, RemoteVTEP, VNI, Type are the columns. @@ -1125,7 +1161,7 @@ Linux kernel version 4.9.x used in SONiC requires backport of a few patches to s Total count : 2 -6. show vxlan remote_vni +7. show vxlan remote_vni - lists all the VLANs learnt from the specified remote ip or all the remotes. (APP DB view) - VLAN, RemoteVTEP, VNI are the columns @@ -1147,7 +1183,35 @@ Linux kernel version 4.9.x used in SONiC requires backport of a few patches to s +---------+--------------+-------+ Total count : 1 - +8. 
show vxlan counters(P2MP Tunnel) + +--------+---------+----------+--------+---------+----------+--------+ + | Tunnel | RX_PKTS | RX_BYTES | RX_PPS | TX_PKTS | TX_BYTES | TX_PPS | + +========+=========+==========+========+=========+==========+========+ + | Vtep1 | 1234 | 1512034 | 10/s | 2234 | 2235235 | 23/s | + +--------+---------+----------+--------+---------+----------+--------+ + +9. show vxlan counters(P2P Tunnels) + +--------------+---------+----------+--------+---------+----------+--------+ + | Tunnel | RX_PKTS | RX_BYTES | RX_PPS | TX_PKTS | TX_BYTES | TX_PPS | + +==============+=========+==========+========+=========+==========+========+ + | EVPN_2.2.2.2 | 1234 | 1512034 | 10/s | 2234 | 2235235 | 23/s | + +--------------+---------+----------+--------+---------+----------+--------+ + | EVPN_3.2.3.2 | 2344 | 162034 | 15/s | 200 | 55235 | 2/s | + +--------------+---------+----------+--------+---------+----------+--------+ + | EVPN_2.2.2.2 | 9853 | 9953260 | 27/s | 8293 | 7435211 | 18/s | + +--------------+---------+----------+--------+---------+----------+--------+ + + +10. show vxlan counters EVPN_5.1.6.8 (Per P2P Tunnel) + EVPN_5.1.6.8 + --------- + + RX: + 13 packets + N/A bytes + TX: + 1,164 packets + N/A bytes ``` ### 5.2 KLISH CLI @@ -1385,18 +1449,26 @@ To support warm boot, all the sai_objects must be uniquely identifiable based on - Verify that there is a SAI_OBJECT_TYPE_BRIDGE_PORT pointing to the above created P2P tunnel. - Verify that there is a SAI_OBJECT_TYPE_VLAN_MEMBER entry for the vlan corresponding to the VNI created and pointing to the above bridge port. 7. Add more REMOTE_VNI table entries to different Remote IP. - - Verify that additional SAI_OBJECT_TYPE_TUNNEL, BRIDGEPORT and VLAN_MEMBER objects are created. + - Verify that additional SAI_OBJECT_TYPE_TUNNEL, BRIDGEPORT and VLAN_MEMBER objects are created in case of platforms that create dynamic P2P tunnels on type 3 routes. 
+ - Verify that vlan flood type is set to SAI_VLAN_FLOOD_CONTROL_TYPE_COMBINED. Verify that L2MC group is created and SAI_OBJECT_TYPE_L2MC_GROUP_MEMBER with end point IP and P2MP bridge port is created and set in vlan's unknown unicast and broadcast flood group in case of platforms that use P2MP tunnel on type 3 routes. 8. Add more REMOTE_VNI table entries to the same Remote IP. - - Verify that additional SAI_OBJECT_TYPE_VLAN_MEMBER entries are created pointing to the already created BRIDGEPORT object per remote ip. -9. Remove the additional entries created above and verify that the created VLAN_MEMBER entries are deleted. -10. Remove the last REMOTE_VNI entry for a DIP and verify that the created VLAN_MEMBER, TUNNEL, BRIDGEPORT ports are deleted. + - Verify that additional SAI_OBJECT_TYPE_VLAN_MEMBER entries are created pointing to the already created BRIDGEPORT object per remote ip in case of platforms that create dynamic P2P tunnels on type 3 routes. + - Verify that additional SAI_OBJECT_TYPE_L2MC_GROUP_MEMBER entries are created per remote ip with P2MP bridge port in case of platforms that use P2MP tunnel on type 3 routes. +9. Remove the additional entries created above + - Verify that the created VLAN_MEMBER entries are deleted in case of platforms that create VLAN_MEMBER. + - Verify that L2MC_GROUP_MEMBER entries are deleted in case of platforms creating SAI_OBJECT_TYPE_L2MC_GROUP_MEMBER per end point IP. +10. Remove the last REMOTE_VNI entry for a DIP + - Verify that the created VLAN_MEMBER, TUNNEL, BRIDGEPORT ports are deleted for platforms that use P2P Tunnels. + - Verify that L2MC_GROUP_MEMBERS are removed, L2MC_GROUP is deleted and vlan's flood group are set to null object as well as vlan's flood type is updated to SAI_VLAN_FLOOD_CONTROL_TYPE_ALL in case of platforms that use P2MP tunnel. ### 8.2 FdbOrch 1. Create a VXLAN_REMOTE_VNI entry to a remote destination IP. 2. Add VXLAN_REMOTE_MAC entry to the above remote IP and VLAN. 
- - Verify ASIC DB table fdb entry is created with remote_ip and bridgeport information. + - Verify ASIC DB table fdb entry is created with remote_ip and bridgeport information. + - In case of platforms that use P2P tunnel, verify that P2P tunnel's bridgeport is used. + - In case of platforms that use P2MP tunnel, verify that P2MP tunnel's bridge port is used. 3. Remove the above MAC entry and verify that the corresponding ASIC DB entry is removed. 4. Repeat above steps for remote static MACs. 5. Add MAC in the ASIC DB and verify that the STATE_DB MAC_TABLE is updated. diff --git a/doc/vxlan/EVPN/images/p2mptunnelcreate.jpg b/doc/vxlan/EVPN/images/p2mptunnelcreate.jpg new file mode 100644 index 0000000000..8c0322eb8a Binary files /dev/null and b/doc/vxlan/EVPN/images/p2mptunnelcreate.jpg differ diff --git a/doc/vxlan/EVPN/images/p2mpvlanextension.jpg b/doc/vxlan/EVPN/images/p2mpvlanextension.jpg new file mode 100644 index 0000000000..1b27ea34da Binary files /dev/null and b/doc/vxlan/EVPN/images/p2mpvlanextension.jpg differ diff --git a/doc/vxlan/Overlay ECMP with BFD.md b/doc/vxlan/Overlay ECMP with BFD.md new file mode 100644 index 0000000000..4485cd6237 --- /dev/null +++ b/doc/vxlan/Overlay ECMP with BFD.md @@ -0,0 +1,360 @@ +# Overlay ECMP with BFD monitoring +## High Level Design Document +### Rev 1.1 + +# Table of Contents + + * [Revision](#revision) + + * [About this Manual](#about-this-manual) + + * [Definitions/Abbreviation](#definitionsabbreviation) + + * [1 Requirements Overview](#1-requirements-overview) + * [1.1 Usecase](#11-usecase) + * [1.2 Functional requirements](#12-functional-requirements) + * [1.3 CLI requirements](#13-cli-requirements) + * [1.4 Warm Restart requirements ](#14-warm-restart-requirements) + * [1.5 Scaling requirements ](#15-scaling-requirements) + * [1.6 SAI requirements ](#16-sai-requirements) + * [2 Modules Design](#2-modules-design) + * [2.1 Config DB](#21-config-db) + * [2.2 App DB](#22-app-db) + * [2.3 Module 
Interaction](#23-module-interaction) + * [2.4 Orchestration Agent](#24-orchestration-agent) + * [2.5 Monitoring and Health](#25-monitoring-and-health) + * [2.6 BGP](#26-bgp) + * [2.7 CLI](#27-cli) + * [2.8 Test Plan](#28-test-plan) + +###### Revision + +| Rev | Date | Author | Change Description | +|:---:|:-----------:|:------------------:|-----------------------------------| +| 0.1 | 09/09/2021 | Prince Sunny | Initial version | +| 1.0 | 09/13/2021 | Prince Sunny | Revised based on review comments | +| 1.1 | 10/08/2021 | Prince Sunny | BFD section seperated | +| 1.2 | 10/18/2021 | Prince Sunny/Shi Su | Test Plan added | +| 1.3 | 11/01/2021 | Prince Sunny | IPv6 test cases added | +| 1.4 | 12/03/2021 | Prince Sunny | Added scaling section, extra test cases | + +# About this Manual +This document provides general information about the Vxlan Overlay ECMP feature implementation in SONiC with BFD support. This is an extension to the existing VNET Vxlan support as defined in the [Vxlan HLD](https://github.com/Azure/SONiC/blob/master/doc/vxlan/Vxlan_hld.md) + + +# Definitions/Abbreviation +###### Table 1: Abbreviations +| | | +|--------------------------|--------------------------------| +| BFD | Bidirectional Forwarding Detection | +| VNI | Vxlan Network Identifier | +| VTEP | Vxlan Tunnel End Point | +| VNet | Virtual Network | + + +# 1 Requirements Overview + +## 1.1 Usecase + +Below diagram captures the use-case. In this, ToR is a Tier0 device and Leaf is a Tier1 device. Vxlan tunnel is established from Leaf (Tier1) to a VTEP endpoint. ToR (Tier0), Spine (Tier3) are transit devices. + + +![](https://github.com/Azure/SONiC/blob/master/images/vxlan_hld/OverlayEcmp_UseCase.png) + +### Packet flow + +- The packets destined to the Tunnel Enpoint shall be Vxlan encapsulated by the Leaf (Tier1). +- Return packet from the Tunnel Endpoint (LBs) back to Leaf may or may not be Vxlan encapsualted. +- Some flows e.g. 
BFD over Vxlan shall require decapsulating Vxlan packets at Leaf. + +## 1.2 Functional requirements + +At a high level the following should be supported: + +- Configure ECMP with Tunnel Nexthops (IPv4 and IPv6) +- Support IPv6 tunnel that can support both IPv4 and IPv6 traffic +- Tunnel Endpoint monitoring via BFD +- Add/Withdraw Nexthop based on Tunnel or Endpoint health + +## 1.3 CLI requirements +- User should be able to show the Vnet routes +- This is an enhancement to existing show command + +## 1.4 Warm Restart requirements +No special handling for Warm restart support. + +## 1.5 Scaling requirements +At a minimum level, the following are the estimated scale numbers + +| Item | Expected value | +|--------------------------|-----------------------------| +| ECMP groups | 512 | +| ECMP group member | 128 | +| Tunnel (Overlay) routes | 16k | +| Tunnel endpoints | 4k | +| BFD monitoring | 4k | + +## 1.6 SAI requirements +In addition to supporting Overlay ECMP (TUNNEL APIs) and BFD (HW OFFLOAD), the platform must support the following SAI attributes +| API | +|--------------------------| +| SAI_SWITCH_ATTR_VXLAN_DEFAULT_ROUTER_MAC | +| SAI_SWITCH_ATTR_VXLAN_DEFAULT_PORT | + + +# 2 Modules Design + +The following are the schema changes. + +## 2.1 Config DB + +Existing Vxlan and Vnet tables. 
+ +### 2.1.1 VXLAN Table +``` +VXLAN_TUNNEL|{{tunnel_name}} + "src_ip": {{ip_address}} + "dst_ip": {{ip_address}} (OPTIONAL) +``` +### 2.1.2 VNET/Interface Table +``` +VNET|{{vnet_name}} + "vxlan_tunnel": {{tunnel_name}} + "vni": {{vni}} + "scope": {{"default"}} (OPTIONAL) + "peer_list": {{vnet_name_list}} (OPTIONAL) + "advertise_prefix": {{false}} (OPTIONAL) +``` + +## 2.2 APP DB + +### VNET + +The following are the changes for Vnet Route table + +Existing: + +``` +VNET_ROUTE_TUNNEL_TABLE:{{vnet_name}}:{{prefix}} + "endpoint": {{ip_address}} + "mac_address":{{mac_address}} (OPTIONAL) + "vni": {{vni}}(OPTIONAL) +``` + +Proposed: +``` +VNET_ROUTE_TUNNEL_TABLE:{{vnet_name}}:{{prefix}} + "endpoint": {{ip_address1},{ip_address2},...} + "endpoint_monitor": {{ip_address1},{ip_address2},...} (OPTIONAL) + "mac_address":{{mac_address1},{mac_address2},...} (OPTIONAL) + "vni": {{vni1},{vni2},...} (OPTIONAL) + "weight": {{w1},{w2},...} (OPTIONAL) + “profile”: {{profile_name}} (OPTIONAL) +``` + +``` +key = VNET_ROUTE_TUNNEL_TABLE:vnet_name:prefix ; Vnet route tunnel table with prefix +; field = value +ENDPOINT = list of ipv4 addresses ; comma separated list of endpoints +ENDPOINT_MONITOR = list of ipv4 addresses ; comma separated list of endpoints, space for empty/no monitoring +MAC_ADDRESS = 12HEXDIG ; Inner dst mac in encapsulated packet +VNI = DIGITS ; VNI value in encapsulated packet +WEIGHT = DIGITS ; Weights for the nexthops, comma separated (Optional) +PROFILE = STRING ; profile name to be applied for this route, for community + string etc (Optional) +``` + +## 2.3 Module Interaction + +Overlay routes can be programmed via RestAPI or gNMI/gRPC interface which is not described in this document. A highlevel module interaction is shown below + +![](https://github.com/Azure/SONiC/blob/master/images/vxlan_hld/OverlayEcmp_ModuleInteraction.png) + +## 2.4 Orchestration Agent +Following orchagents shall be modified. 
+ +### VnetOrch + +#### Requirements + +- Vnetorch to add support to handle multiple endpoints for APP_VNET_RT_TUNNEL_TABLE_NAME based route task. +- Reuse Nexthop tunnel based on the endpoint configuration. +- If there is already the same endpoint exists, use that as member for Nexthop group. +- Similar to above, reuse nexthop group, if multiple routes are programmed with the same set of nexthops. +- Provide support for endpoint modification for a route prefix. Require SAI support for SET operation of routes. +- Provide support for endpoint deletion for a route prefix. Orchagent shall check the existing entries and delete any tunnel/nexthop based on the new route update +- Ensure backward compatibility with single endpoint routes +- Use SAI_NEXT_HOP_GROUP_MEMBER_ATTR_WEIGHT for specifying weights to nexthop member +- Desirable to have per tunnel stats via sai_tunnel_stat_t + +#### Detailed flow + +VnetOrch is one of the critical module for supporting overlay ecmp. VnetOrch subscribes to VNET and ROUTE updates from APP_DB. + +When a new route update is processed by the add operation, + +1. VnetOrch checks the nexthop group and if it exists, reuse the group +2. For a new nexthop group member, add the ECMP member and identify the corresponding monitoring IP address. Create a mapping between the monitoring IP and nexthop tunnel endpoint. +3. Initiate a BFD session for the monitoring IP if it does not exist +4. Based on the BFD implementation (BfdOrch vs Control plane BFD), subscribe to BFD state change, either directly as subject observer (similar to port oper state notifications in orchagent) or via STATEDB update. +5. Based on the VNET global configuration to advertise prefixes, indicate to STATEDB if the prefix must be advertised by BGP/FRR only if there is atleast one active nexthop. Remove this entry if there are no active nexthops indicated by BFD session down so that the network pfx is no longer advertised. 
+ +#### Monitoring Endpoint Mapping + +VNET_ROUTE_TUNNEL_TABLE can provide monitoring endpoint IPs which can be different from the tunnel termination endpoints. VnetOrch creates a mapping for such endpoints and based on the monitoring endpoint (MonEP1) health, proceed with adding/removing nexthop tunnel endpoint (EP1) from the ECMP group for the respective prefix. It is assumed that for one tunnel termination endpoint (EP1), there shall be only one corresponding monitoring endpoint (MonEP1). + +#### Pros of SWSS to handle route update based on tunnel nexthop health: + +- No significant changes, if BFD session management is HW offload via SAI notifications or Control Plane assisted. +- Similar to NHFLAGS handling for existing route ECMP group +- Better performance in re-programming routes in ASIC instead of separate process to monitor and modify each route prefix by updating DB entries + +### Bfd HW offload + +This design requires endpoint health monitoring by setting BFD sessions via HW offload. Details of BFD orchagent and HW offloading is captured in this [document](https://github.com/Azure/SONiC/blob/master/doc/bfd/BFD%20HW%20Offload%20HLD.md) + + +## 2.5 Monitoring and Health + +The routes are programmed based on the health of tunnel endpoints. It is possible that a tunnel endpoint health is monitored via another dedicated “monitoring” endpoint. Implementation shall enforce a “keep-alive” mechanism to monitor the health of end point and withdraw or reinstall the route when the endpoint is inactive or active respectively. +When an endpoint is deemed unhealthy, router shall perform the following actions: +1. Remove the nexthop from the ECMP path. If all endpoints are down, the route shall be withdrawn. +2. If 50% of the nexthops are down, an alert shall be generated. + +## 2.6 BGP + +Advertise VNET routes +The overlay routes programmed on the device must be advertised to BGP peers. This can be achieved by the “network” command. 
+ +For example: +``` +router bgp 1 + address-family ipv4 unicast + network 10.0.0.0/8 + exit-address-family + ``` + +This configuration example says that network 10.0.0.0/8 will be announced to all neighbors. FRR bgpd doesn’t care about IGP routes when announcing its routes. + + +## 2.7 CLI + +The following commands shall be modified/added : + +``` + - show vnet routes all + - show vnet routes tunnel +``` + +Config commands for VNET, VNET Routes and BFD session is not considered in this design. This shall be added later based on requirement. + +## 2.8 Test Plan + +Pre-requisite: + +Create VNET and Vxlan tunnel as an below: + +``` +{  + "VXLAN_TUNNEL": { + "tunnel_v4": { + "src_ip": "10.1.0.32" + } + }, + + "VNET": { + "Vnet_3000": { + "vxlan_tunnel": "tunnel_v4", + "vni": "3000", + "scope": "default" + } +    } +``` +Similarly for IPv6 tunnels + +``` +{  + "VXLAN_TUNNEL": { + "tunnel_v6": { + "src_ip": "fc00:1::32" + } + }, + + "VNET": { + "Vnet_3001": { + "vxlan_tunnel": "tunnel_v6", + "vni": "3001", + "scope": "default" + } +    } +``` + +Note: It can be safely assumed that only one type of tunnel exists - i.e, either IPv4 or IPv6 for this use-case + +For ```default``` scope, no need to associate interfaces to a VNET + +VNET tunnel routes must be created as shown in the example below + +``` +[ +    "VNET_ROUTE_TUNNEL_TABLE:Vnet_3000:100.100.2.1/32": {  +        "endpoint": "1.1.1.2",  + "endpoint_monitor": "1.1.2.2" +    }  +] +``` + +With IPv6 tunnels, prefixes can be either IPv4 or IPv6 + +``` +[ +    "VNET_ROUTE_TUNNEL_TABLE:Vnet_3001:100.100.2.1/32": {  +        "endpoint": "fc02:1000::1",  + "endpoint_monitor": "fc02:1000::2" +    }, + "VNET_ROUTE_TUNNEL_TABLE:Vnet_3001:20c0:a820:0:80::/64": {  +        "endpoint": "fc02:1001::1",  + "endpoint_monitor": "fc02:1001::2" +    } +] +``` + +### Test Cases + +#### Overlay ECMP + +It is assumed that the endpoint IPs may not have exact match underlay route but may have an LPM underlay route or a default route. 
Test must consider both IPv4 and IPv6 traffic for routes configured as example shown above + +| Step | Goal | Expected results | +|-|-|-| +|Create a tunnel route to a single endpoint a. Send packets to the route prefix dst| Tunnel route create | Packets are received only at endpoint a | +|Set the tunnel route to another endpoint b. Send packets to the route prefix dst | Tunnel route set | Packets are received only at endpoint b | +|Remove the tunnel route. Send packets to the route prefix dst | Tunnel route remove | Packets are not received at any ports with dst IP of b | +|Create tunnel route 1 with two endpoints A = {a1, a2}. Send multiple packets (varying tuple) to the route 1's prefix dst. | ECMP route create | Packets are received at both a1 and a2 | +|Create tunnel route 2 to endpoint group A Send multiple packets (varying tuple) to route 2’s prefix dst | ECMP route create | Packets are received at both a1 and a2 | +|Set tunnel route 2 to endpoint group B = {b1, b2}. Send packets to route 2’s prefix dst | ECMP route set | Packets are received at either b1 or b2 | +|Send packets to route 1’s prefix dst. By removing route 2 from group A, no change expected to route 1 | NHG modify | Packets are received at either a1 or a2 | +|Set tunnel route 2 to single endpoint b1. Send packets to route 2’s prefix dst | NHG modify | Packets are recieved at b1 only | +|Set tunnel route 2 to shared endpoints a1 and b1. Send packets to route 2’s prefix dst | NHG modify | Packets are recieved at a1 or b1 | +|Remove tunnel route 2. Send packets to route 2’s prefix dst | ECMP route remove | Packets are not recieved at any ports with dst IP of a1 or b1 | +|Set tunnel route 3 to endpoint group C = {c1, c2, c3}. Ensure c1, c2, and c3 matches to underlay default route. Send 10000 pkt with random hash to route 3's prefix dst | NHG distribution | Packets are distributed equally across c1, c2 and c3 | +|Modify the underlay default route nexthop/s. 
Send packets to route 3's prefix dst | Underlay ECMP | No change to packet distribution. Packets are distributed equally across c1, c2 and c3 | +|Remove the underlay default route. | Underlay ECMP | Packets are not recieved at c1, c2 or c3 | +|Re-add the underlay default route. | Underlay ECMP | Packets are equally recieved at c1, c2 or c3 | +|Bring down one of the port-channels. | Underlay ECMP | Packets are equally recieved at c1, c2 or c3 | +|Create a more specific underlay route to c1. | Underlay ECMP | Verify c1 packets are received only on the c1's nexthop interface | +|Create tunnel route 4 to endpoint group A Send packets (fixed tuple) to route 4’s prefix dst | Vxlan Entropy | Verify Vxlan entropy| +|Change the udp src port of original packet to route 4’s prefix dst | Vxlan Entropy | Verify Vxlan entropy is changed| +|Change the udp dst port of original packet to route 4’s prefix dst | Vxlan Entropy | Verify Vxlan entropy is changed| +|Change the src ip of original packet to route 4’s prefix dst | Vxlan Entropy | Verify Vxlan entropy is changed| +|Create/Delete overlay routes to 16k with unique endpoints upto 4k | CRM | Verify crm resourse for route (ipv4/ipv6) and nexthop (ipv4/ipv6) | +|Create/Delete overlay nexthop groups upto 512 | CRM | Verify crm resourse for nexthop_group | +|Create/Delete overlay nexthop group members upto 128 | CRM | Verify crm resourse for nexthop_group_member | + +#### BFD and health monitoring + +TBD + +#### BGP advertising + +TBD diff --git a/doc/xrcvd/transceiver-monitor-hld.md b/doc/xrcvd/transceiver-monitor-hld.md index 97b875c3d8..b5ea73097f 100644 --- a/doc/xrcvd/transceiver-monitor-hld.md +++ b/doc/xrcvd/transceiver-monitor-hld.md @@ -237,7 +237,7 @@ A thread will be started to periodically refresh the DOM sensor information. 
Detailed flow as showed in below chart: -![](https://github.com/keboliu/SONiC/blob/master/images/xcvrd-flow.svg) +![](https://github.com/Azure/SONiC/blob/d1159ca728112f10319fa47de4df89c445a27efc/images/transceiver_monitoring_hld/xcvrd_flow.svg) #### 1.4.1 State machine of sfp\_state\_update\_task process #### diff --git a/doc/ztp/ztp.md b/doc/ztp/ztp.md index 53c41b8ffd..053c98b3f2 100644 --- a/doc/ztp/ztp.md +++ b/doc/ztp/ztp.md @@ -715,7 +715,7 @@ If user does not provide both DHCP option 67 or DHCP option 239, ZTP service con Following is the order in which DHCP options are processed: -1. The ZTP JSON file specified in pre-defined location as part of the image Local file on disk */host/ztp/ztp_local_data.json*. +1. The ZTP JSON file specified in pre-defined location as part of the image Local file on disk */host/ztp/ztp_data_local.json*. 2. ZTP JSON URL specified via DHCP Option-67 3. ZTP JSON URL constructed using DHCP Option-66 TFTP server name, DHCP Option-67 file path on TFTP server 4. ZTP JSON URL specified via DHCPv6 Option-59 diff --git a/images/VM_image2.png b/images/VM_image2.png index c0be5e5c21..33308c93cf 100644 Binary files a/images/VM_image2.png and b/images/VM_image2.png differ diff --git a/images/ecmp/order_ecmp_pic.png b/images/ecmp/order_ecmp_pic.png new file mode 100644 index 0000000000..a4166331fb Binary files /dev/null and b/images/ecmp/order_ecmp_pic.png differ diff --git a/sonic_latest_images.html b/sonic_latest_images.html index 0f331d5aa2..5636a5f36f 100644 --- a/sonic_latest_images.html +++ b/sonic_latest_images.html @@ -45,8 +45,8 @@
-

Latest Successful Builds


-

NOTE: This page is updated manually once in a while and hence may not be pointing to the latest MASTER image. The current links are based on 16thOct2021 successful builds. To get the latest master image, refer pipelines page. +

Latest Successful Builds

+
@@ -67,7 +67,10 @@

Latest Successful Builds


- click here for previous builds
+ NOTE: The 5-digit number given in the cells specifies the build ID of the images. +

+

+ click here for previous builds

@@ -88,18 +91,21 @@

Latest Successful Builds


images = Object.keys(data[branches[i]]); for (let j = 0; j < images.length; j++) { image_name = images[j]; - image = data[branches[i]][images[j]]; + image = data[branches[i]][images[j]]; image_platform = image_name.split(".")[0]; + image_platform2 = image_name; if(image_platform.length == 1){ platform = "" }else{ platform = image_platform.split("sonic-")[1]; + platform2 = image_platform2.split("sonic-")[1]; if(platform.length == 1){ platform = "" } } image_avail = true; image_url = image['url']; + build_id = image['build']; if(image_url === 'null' || image_url === ""){ image_avail = false; } @@ -108,7 +114,7 @@

Latest Successful Builds


$("#disp_table").append(platform_column); } if (image_avail) - image_column =""+image_name+""; + image_column =""+platform2+"-"+build_id+""; else image_column ="N/A"; diff --git a/supported_devices_platforms_md.sh b/supported_devices_platforms_md.sh new file mode 100644 index 0000000000..fa3d0f76e0 --- /dev/null +++ b/supported_devices_platforms_md.sh @@ -0,0 +1,185 @@ +!/usr/bin/env bash +git checkout -b sonic_image_md_update +git config --global user.email "xinxliu@microsoft.com" +git config --global user.name "xinliu-seattle" +git reset --hard +git pull origin sonic_image_md_update + + +#set -euo pipefail + +DEFID_BRCM="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.broadcom' | jq -r '.value[0].id')" +DEFID_MLNX="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.mellanox' | jq -r '.value[0].id')" +DEFID_VS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.vs' | jq -r '.value[0].id')" +DEFID_INNO="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.innovium' | jq -r '.value[0].id')" +DEFID_BFT="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.barefoot' | jq -r '.value[0].id')" +DEFID_CHE="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.cache' | jq -r '.value[0].id')" +DEFID_CTC="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.centec' | jq -r '.value[0].id')" +DEFID_CTC64="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.centec-arm64' | jq -r '.value[0].id')" +DEFID_GRC="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.generic' | jq -r 
'.value[0].id')" +DEFID_MRV="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.marvell-armhf' | jq -r '.value[0].id')" +DEFID_NPH="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/definitions?name=Azure.sonic-buildimage.official.nephos' | jq -r '.value[0].id')" + +first=1 +for BRANCH in master +do + first='' + BUILD_BRCM="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_BRCM}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_BRCM_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_BRCM}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_MLNX="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_MLNX}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_MLNX_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_MLNX}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_VS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_VS}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_VS_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_VS}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_INNO="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_INNO}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_INNO_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_INNO}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_BFT="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_BFT}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | 
jq -r '.value[0].id')" + BUILD_BFT_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_BFT}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_CHE="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_CHE}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_CHE_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_CHE}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_CTC="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_CTC}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_CTC_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_CTC}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_CTC64="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_CTC64}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_CTC64_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_CTC64}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_GRC="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_GRC}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_GRC_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_GRC}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_MRV="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_MRV}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_MRV_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_MRV}"'?api-version=6.0' | jq -r '.queueTime')" + BUILD_NPH="$(curl -s 
'https://dev.azure.com/mssonic/build/_apis/build/builds?definitions='"${DEFID_NPH}"'&branchName=refs/heads/'"${BRANCH}"'&$top=1&resultFilter=succeeded&api-version=6.0' | jq -r '.value[0].id')" + BUILD_NPH_TS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_NPH}"'?api-version=6.0' | jq -r '.queueTime')" + + #echo " [*] Last successful builds for \"${BRANCH}\":" + #echo " Broadcom: ${BUILD_BRCM}" + #echo " Mellanox: ${BUILD_MLNX}" + #echo " Virtual Switch: ${BUILD_VS}" + + ARTF_BRCM="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_BRCM}"'/artifacts?artifactName=sonic-buildimage.broadcom&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_MLNX="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_MLNX}"'/artifacts?artifactName=sonic-buildimage.mellanox&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_VS="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_VS}"'/artifacts?artifactName=sonic-buildimage.vs&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_INNO="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_INNO}"'/artifacts?artifactName=sonic-buildimage.innovium&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_BFT="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_BFT}"'/artifacts?artifactName=sonic-buildimage.barefoot&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_CHE="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_CHE}"'/artifacts?artifactName=sonic-buildimage.cache&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_CTC="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_CTC}"'/artifacts?artifactName=sonic-buildimage.centec&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_CTC64="$(curl -s 
'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_CTC64}"'/artifacts?artifactName=sonic-buildimage.centec-arm64&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_GRC="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_GRC}"'/artifacts?artifactName=sonic-buildimage.generic&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_MRV="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_MRV}"'/artifacts?artifactName=sonic-buildimage.marvell-armhf&api-version=5.1' | jq -r '.resource.downloadUrl')" + ARTF_NPH="$(curl -s 'https://dev.azure.com/mssonic/build/_apis/build/builds/'"${BUILD_NPH}"'/artifacts?artifactName=sonic-buildimage.nephos&api-version=5.1' | jq -r '.resource.downloadUrl')" + +echo "# Supported Platforms" > supported_devices_platforms.md + +echo "#### Following is the list of platforms that supports SONiC." >> supported_devices_platforms.md +echo "| S.No | Vendor | Platform | ASIC Vendor | Switch ASIC | Port Configuration | Image |" >> supported_devices_platforms.md +echo "| ---- | -------------- | ----------- | ----------------- | ----------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |" >> supported_devices_platforms.md +echo "| 1 | Accton | AS4630-54PE | Broadcom | Helix 5 | 48x1G + 4x25G + 2x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 2 | Accton | AS5712-54X | Broadcom | Trident 2 | 72x10G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 3 | Accton | AS5812-54X | Broadcom | Trident 2 | 72x10G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md 
+echo "| 4 | Accton | AS5835-54T | Broadcom | Trident 3 | 48x10G + 6x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 5 | Accton | AS5835-54X | Broadcom | Trident 3 | 48x10G + 6x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 6 | Accton | AS6712-32X | Broadcom | Trident 2 | 32x40G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 7 | Accton | AS7116-54X | Nephos | Taurus | 48x25G + 6x100G | [SONiC-ONIE-Nephos]($(echo "${ARTF_NPH}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-nephos.bin/')) |" >> supported_devices_platforms.md +echo "| 8 | Accton | AS7312-54X | Broadcom | Tomahawk | 48x25G + 6x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 9 | Accton | AS7312-54XS | Broadcom | Tomahawk | 48x25G + 6x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 10 | Accton | AS7315-27XB | Broadcom | Qumran | 20x10G + 4x25G + 3x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 11 | Accton | AS7326-56X | Broadcom | Trident 3 | 48x25G + 8x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 12 | Accton | AS7512-32X | Cavium | XPliantCNX880** | 32x100G | [SONiC-ONIE-Cavium](https://sonic-build.azurewebsites.net/ui/sonic/Pipelines) |" >> 
supported_devices_platforms.md +echo "| 13 | Accton | AS7712-32X | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 14 | Accton | AS7716-32X | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 15 | Accton | AS7716-32XB | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 16 | Accton | AS7726-32X | Broadcom | Trident 3 | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 17 | Accton | AS7816-64X | Broadcom | Tomahawk 2 | 64x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 18 | Accton | AS9716-32D | Broadcom | Tomahawk 3 | 32x400G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 19 | Accton | Minipack | Broadcom | Tomahawk 3 | 128x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 20 | Alpha
Networks | SNH60A0-320Fv2 | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 21 | Alpha
Networks | SNH60B0-640F | Broadcom | Tomahawk 2 | 64x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 22 | Arista | 7050QX-32 | Broadcom | Trident 2 | 32x40G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 23 | Arista | 7050QX-32S | Broadcom | Trident 2 | 32x40G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 24 | Arista | 7050CX3-32S | Broadcom | Trident 3 | 32x100G + 2x10G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 25 | Arista | 7060CX-32S | Broadcom | Tomahawk | 32x100G + 2x10G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 26 | Arista | 7060DX4-32 | Broadcom | Tomahawk 3 | 32x400G + 2x10G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 27 | Arista | 7060PX4-32 | Broadcom | Tomahawk 3 | 32x400G + 2x10G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 28 | Arista | 7170-32CD | Barefoot | Tofino | 32x100G + 2x10G | [SONiC-Aboot-Barefoot]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 29 | Arista | 7170-64C | Barefoot | Tofino | 64x100G + 2x10G | [SONiC-Aboot-Barefoot]($(echo "${ARTF_BFT}" | sed 
's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 30 | Arista | 7260CX3-64 | Broadcom | Tomahawk 2 | 64x100G + 2x10G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 31 | Arista | 7280CR3-32D4 | Broadcom | Jericho 2 | 32x100G + 4x400G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 32 | Arista | 7280CR3-32P4 | Broadcom | Jericho 2 | 32x100G + 4x400G | [SONiC-Aboot-Broadcom]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-aboot-barefoot.swi/')) |" >> supported_devices_platforms.md +echo "| 33 | Barefoot | SONiC-P4 | Barefoot | P4 Emulated | Configurable | [SONiC-P4](https://sonic-build.azurewebsites.net/ui/sonic/Pipelines) |" >> supported_devices_platforms.md +echo "| 34 | Barefoot | Wedge 100BF-32 | Barefoot | Tofino | 32x100G | [SONiC-ONIE-Barefoot]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-barefoot.bin/')) |" >> supported_devices_platforms.md +echo "| 35 | Barefoot | Wedge 100BF-65X | Barefoot | Tofino | 32x100G | [SONiC-ONIE-Barefoot]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-barefoot.bin/')) |" >> supported_devices_platforms.md +echo "| 36 | Celestica | DX010 | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 37 | Celestica | E1031 | Broadcom | Helix4 | 48x1G + 4x10G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 38 | Celestica | midstone-200i | Innovium | Teralynx 7 | 128x100G 
|[SONiC-ONIE-Innovium]($(echo "${ARTF_INNO}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-innovium-dbg.bin/')) |" >> supported_devices_platforms.md +echo "| 39 | Celestica | Silverstone | Broadcom | Tomahawk 3 | 32x400G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 40 | Celestica | Seastone_2 | Broadcom | Trident 3 | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 41 | Centec | E582-48X2Q | Centec | Goldengate | 48x10G + 2x40G + 4x100G | [SONiC-ONIE-Centec]($(echo "${ARTF_CTC}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-centec.bin/')) |" >> supported_devices_platforms.md +echo "| 42 | Centec | E582-48X6Q | Centec | Goldengate | 48x10G + 6x40G | [SONiC-ONIE-Centec]($(echo "${ARTF_CTC}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-centec.bin/')) |" >> supported_devices_platforms.md +echo "| 43 | Cig | CS6436-56P | Nephos | NP8366 | 48x25G + 8x100G | [SONiC-ONIE-Nephos]($(echo "${ARTF_NPH}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-nephos.bin/')) |" >> supported_devices_platforms.md +echo "| 44 | Cig | CS5435-54P | Nephos | NP8363 | 10GX48,100GX6 | [SONiC-ONIE-Nephos]($(echo "${ARTF_NPH}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-nephos.bin/')) |" >> supported_devices_platforms.md +echo "| 45 | Cig | CS6436-54P | Nephos | NP8365 | 25GX48,100GX6 | [SONiC-ONIE-Nephos]($(echo "${ARTF_NPH}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-nephos.bin/')) |" >> supported_devices_platforms.md +echo "| 46 | Dell | N3248PXE | Broadcom | Trident 3.X5 | 48x10GCU+4x25G-2x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 47 | Dell | N3248TE | 
Broadcom | Trident 3.X3 | 48x1G+4x10G-2x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 48 | Dell | S5212F | Broadcom | Trident 3.X5 | 12x25G+3x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 49 | Dell | S5224F | Broadcom | Trident 3.X5 | 24x25G+4x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 50 | Dell | S5232F-ON | Broadcom | Trident 3 | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 51 | Dell | S5248F-ON | Broadcom | Trident 3-2T | 48x25G,4x100G,2x200G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 52 | Dell | s5296F | Broadcom | Trident 3 | 96x25G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 53 | Dell | S6000-ON | Broadcom | Trident 2 | 32x40G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 54 | Dell | S6100-ON | Broadcom | Tomahawk | 64x40G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 55 | Dell | Z9100-ON | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 56 
| Dell | Z9264F-ON | Broadcom | Tomahawk 2 | 64x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 57 | Dell | Z9332F-ON | Broadcom | Tomahawk 3 | 32x400G,2x10G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 58 | Dell | Z9332f-C32 | Broadcom | Tomahawk 3 | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 59 | Delta | AG5648 | Broadcom | Tomahawk | 48x25G + 6x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 60 | Delta | AG9032V1 | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 61 | Delta | AG9032V2A | Broadcom | Trident 3 | 32x100G + 1x10G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 62 | Delta | AG9064 | Broadcom | Tomahawk 2 | 64x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 63 | Delta | et-c032if | Innovium | Teralynx 7 | 32x400G |[SONiC-ONIE-Innovium]($(echo "${ARTF_INNO}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-innovium-dbg.bin/')) |" >> supported_devices_platforms.md +echo "| 64 | Delta | ET-6448M | Marvell | Prestera 98DX3255 | 48xGE + 4x10G | [SONiC-ONIE-Marvell]($(echo "${ARTF_MRV}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-marvell-armhf.bin/')) |" >> 
supported_devices_platforms.md +echo "| 65 | Delta | agc032 | Broadcom | Tomahawk3 | 32x400G + 2x10G | [SONiC-ONIE-Marvell]($(echo "${ARTF_MRV}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-marvell-armhf.bin/')) |" >> supported_devices_platforms.md +echo "| 66 | Embedway | ES6220 (48x10G) | Centec | Goldengate | 48x10G + 2x40G + 4x100G | [SONiC-ONIE-Centec]($(echo "${ARTF_CTC}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-centec.bin/')) |" >> supported_devices_platforms.md +echo "| 67 | Embedway | ES6428A-X48Q2H4 | Centec | Goldengate | 4x100G + 2x40G + 48x10G | [SONiC-ONIE-Centec]($(echo "${ARTF_CTC}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-centec.bin/')) |" >> supported_devices_platforms.md +echo "| 68 | Facebook | Wedge 100-32X | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 69 | Ingrasys | S8810-32Q | Broadcom | Trident 2 | 32x40G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 70 | Ingrasys | S8900-54XC | Broadcom | Tomahawk | 48x25G + 6x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 71 | Ingrasys | S8900-64XC | Broadcom | Tomahawk | 48x25G + 16x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 72 | Ingrasys | S9100-32X | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 73 | Ingrasys | S9130-32X | Nephos | Taurus | 32x100G | [SONiC-ONIE-Nephos]($(echo "${ARTF_NPH}" | sed 
's/format=zip/format=file\&subpath=\/target\/sonic-nephos.bin/')) |" >> supported_devices_platforms.md +echo "| 74 | Ingrasys | S9180-32X | Barefoot | Tofino | 32x100G | [SONiC-ONIE-Barefoot]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-barefoot.bin/')) |" >> supported_devices_platforms.md +echo "| 75 | Ingrasys | S9200-64X | Broadcom | Tomahawk 2 | 64x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 76 | Ingrasys | S9230-64X | Nephos | Taurus | 64x100G | [SONiC-ONIE-Nephos]($(echo "${ARTF_NPH}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-nephos.bin/')) |" >> supported_devices_platforms.md +echo "| 77 | Ingrasys | S9280-64X | Barefoot | Tofino | 64x100G | [SONiC-ONIE-Barefoot]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-barefoot.bin/')) |" >> supported_devices_platforms.md +echo "| 78 | Inventec | D6254QS | Broadcom | Trident 2 | 72x10G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 79 | Inventec | D6356 | Broadcom | Trident 3 | 48x25G + 8x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 80 | Inventec | D6556 | Broadcom | Trident 3 | 48x25G + 8x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 81 | Inventec | D7032Q | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 82 | Inventec | D7054Q | Broadcom | Tomahawk | 48x25G + 6x100G | [SONiC-ONIE-Broadcom]($(echo 
"${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 83 | Inventec | D7264Q | Broadcom | Tomahawk 2 | 64x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 84 | Juniper Networks| QFX5210-64C | Broadcom | Tomahawk 2 | 64x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 85 | Juniper Networks| QFX5200-32C-S | Broadcom | Tomahawk 1 | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 86 | Marvell | RD-ARM-48XG6CG-A4 | Marvell | Prestera 98EX54xx | 6x100G+48x10G | [SONiC-ONIE-Marvell]($(echo "${ARTF_MRV}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-marvell-armhf.bin/')) |" >> supported_devices_platforms.md +echo "| 87 | Marvell | RD-BC3-4825G6CG-A4 | Marvell | Prestera 98CX84xx | 6x100G+48x25G | [SONiC-ONIE-Marvell]($(echo "${ARTF_MRV}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-marvell-armhf.bin/')) |" >> supported_devices_platforms.md +echo "| 88 | Marvell | 98cx8580 | Marvell | Prestera CX | 32x400G + 16x400G | [SONiC-ONIE-Marvell]($(echo "${ARTF_MRV}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-marvell-armhf.bin/')) |" >> supported_devices_platforms.md +echo "| 89 | Nvidia | SN2010 | Nvidia | Spectrum | 18x25G + 4x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 90 | Nvidia | SN2100 | Nvidia | Spectrum | 16x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md 
+echo "| 91 | Nvidia | SN2410 | Nvidia | Spectrum | 48x25G + 8x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 92 | Nvidia | SN2700 | Nvidia | Spectrum | 32x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 93 | Nvidia | SN3420 | Nvidia | Spectrum 2 | 48x25G + 12x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 94 | Nvidia | SN3700 | Nvidia | Spectrum 2 | 32x200G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 95 | Nvidia | SN3700C | Nvidia | Spectrum 2 | 32x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 96 | Nvidia | SN3800 | Nvidia | Spectrum 2 | 64x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 97 | Nvidia | SN4600C | Nvidia | Spectrum 3 | 64x100G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 98 | Nvidia | SN4700 | Nvidia | Spectrum 3 | 32x400G | [SONiC-ONIE-Mellanox]($(echo "${ARTF_MLNX}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-mellanox.bin/')) |" >> supported_devices_platforms.md +echo "| 99 | Mitac | LY1200-B32H0-C3 | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo 
"| 100 | Pegatron | Porsche | Nephos | Taurus | 48x25G + 6x100G | [SONiC-ONIE-Nephos]($(echo "${ARTF_NPH}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-nephos.bin/')) |" >> supported_devices_platforms.md +echo "| 101 | Quanta | T3032-IX7 | Broadcom | Trident 3 | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 102 | Quanta | T4048-IX8 | Broadcom | Trident 3 | 48x25G + 8x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 103 | Quanta | T4048-IX8C | Broadcom | Trident 3 | 48x25G + 8x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 104 | Quanta | T7032-IX1B | Broadcom | Tomahawk | 32x100G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 105 | Quanta | T9032-IX9 | Broadcom | Tomahawk 3 | 32x400G | [SONiC-ONIE-Broadcom]($(echo "${ARTF_BRCM}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-broadcom.bin/')) |" >> supported_devices_platforms.md +echo "| 106 | Wnc | OSW1800 | Barefoot | Tofino | 48x25G + 6x100G | [SONiC-ONIE-Barefoot]($(echo "${ARTF_BFT}" | sed 's/format=zip/format=file\&subpath=\/target\/sonic-barefoot.bin/')) |" >> supported_devices_platforms.md + + + +done + +git add supported_devices_platforms.md +git commit -m "latest links for sonic images in supported platform md file" +git push -f --set-upstream origin sonic_image_md_update diff --git a/supported_devices_platforms_md.sh - Shortcut.lnk b/supported_devices_platforms_md.sh - Shortcut.lnk new file mode 100644 index 0000000000..2b38a482f7 Binary files /dev/null and b/supported_devices_platforms_md.sh - 
Shortcut.lnk differ