diff --git a/docs/metrics/plugins/infiniband.md b/docs/metrics/plugins/infiniband.md new file mode 100644 index 00000000000..aaaa6ed230c --- /dev/null +++ b/docs/metrics/plugins/infiniband.md @@ -0,0 +1,48 @@ +# `infiniband` (Linux) + +Gathers NVIDIA Infiniband port counters from `/sys/class/infiniband` and debug status parameters from `/sys/class/net`. + +## Metrics + +Infiniband Port Counter Statistics + +Infiniband Status Parameter Statistics + +## Architecture + +The plugin uses the following data sources: + +1. `/sys/class/infiniband` +2. `/sys/class/net` + +### Code Locations + +- Plugin code interfacing with the Infiniband driver: *pkg/plugin/infiniband/* + +## Label Values for Infiniband Port Counters + +Below is a running list of all statistics for Infiniband port counters: + +- `excessive_buffer_overrun_errors` +- `link_downed` +- `link_error_recovery` +- `local_link_integrity_errors` +- `port_rcv_constraint_errors` +- `port_rcv_data` +- `port_rcv_errors` +- `port_rcv_packets` +- `port_rcv_remote_physical_errors` +- `port_rcv_switch_replay_errors` +- `port_xmit_constraint_errors` +- `port_xmit_data` +- `port_xmit_discards` +- `port_xmit_packets` +- `symbol_error` +- `VL15_dropped` + +## Label Values for Infiniband Debug Status Parameters + +Below is a running list of all statistics for Infiniband debug status parameters: + +- `lro_timeout` +- `link_down_reason` diff --git a/image-metadata-retina-agent-v0.0.5-24-g77b2e86-linux-amd64.json b/image-metadata-retina-agent-v0.0.5-24-g77b2e86-linux-amd64.json new file mode 100644 index 00000000000..e69de29bb2d diff --git a/pkg/log/test.log b/pkg/log/test.log new file mode 100644 index 00000000000..2565fe743e0 --- /dev/null +++ b/pkg/log/test.log @@ -0,0 +1,98 @@ +{"level":"info","ts":"2024-04-02T21:24:34.986-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":0}
+{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":1} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":2} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":3} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":4} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":5} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":6} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":7} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":8} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":9} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:43","msg":"Filename: 
","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log","name":"log"} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log/test.log","name":"test.log"} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log/zap.go","name":"zap.go"} +{"level":"info","ts":"2024-04-02T21:24:34.987-0700","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log/zap_test.go","name":"zap_test.go"} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":0} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":1} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":2} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":3} 
+{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":4} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":5} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":6} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":7} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":8} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","i":9} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log","name":"log"} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log/test.log","name":"test.log"} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:43","msg":"Filename: 
","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log/zap.go","name":"zap.go"} +{"level":"info","ts":"2024-04-02T21:24:46.824-0700","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.21.8","os":"linux","arch":"amd64","numcores":12,"hostname":"","podname":"","path":"/home/spencermckee/go/src/github.com/Azure/retina/pkg/log/zap_test.go","name":"zap_test.go"} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":0} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":1} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":2} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":3} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":4} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":5} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":6} 
+{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":7} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":8} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":9} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log","name":"log"} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/test.log","name":"test.log"} +{"level":"info","ts":"2024-04-23T20:47:06.365Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap.go","name":"zap.go"} +{"level":"info","ts":"2024-04-23T20:47:06.366Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap_test.go","name":"zap_test.go"} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":0} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":1} 
+{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":2} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":3} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":4} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":5} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":6} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":7} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":8} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":9} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log","name":"log"} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:43","msg":"Filename: 
","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/test.log","name":"test.log"} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap.go","name":"zap.go"} +{"level":"info","ts":"2024-04-23T20:49:40.517Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap_test.go","name":"zap_test.go"} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":0} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":1} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":2} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":3} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":4} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":5} 
+{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":6} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":7} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":8} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":9} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log","name":"log"} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/test.log","name":"test.log"} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap.go","name":"zap.go"} +{"level":"info","ts":"2024-04-23T20:56:55.674Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap_test.go","name":"zap_test.go"} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":0} 
+{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":1} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":2} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":3} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":4} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":5} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":6} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":7} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":8} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":9} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log","name":"log"} 
+{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/test.log","name":"test.log"} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap.go","name":"zap.go"} +{"level":"info","ts":"2024-04-23T22:22:49.188Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap_test.go","name":"zap_test.go"} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":0} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":1} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":2} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":3} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":4} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":5} 
+{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":6} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":7} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":8} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:29","msg":"test","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","i":9} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log","name":"log"} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/test.log","name":"test.log"} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap.go","name":"zap.go"} +{"level":"info","ts":"2024-04-23T22:23:47.486Z","caller":"log/zap_test.go:43","msg":"Filename: ","goversion":"go1.22.2","os":"linux","arch":"amd64","numcores":16,"hostname":"","podname":"","path":"/home/azureuser/retina/pkg/log/zap_test.go","name":"zap_test.go"} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 44e19f86197..6437cf952fa 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -147,6 +147,24 @@ func InitializeMetrics() { 
utils.DNSLabels..., ) + // InfiniBand Metrics + InfinibandCounterStats = exporter.CreatePrometheusGaugeVecForMetric( + exporter.DefaultRegistry, + utils.InfinibandCounterStatsName, + infinibandCounterStatsDescription, + utils.StatName, + utils.Device, + utils.Port, + ) + + InfinibandStatusParams = exporter.CreatePrometheusGaugeVecForMetric( + exporter.DefaultRegistry, + utils.InfinibandStatusParamsName, + infinibandStatusParamsDescription, + utils.StatName, + utils.InterfaceName, + ) + isInitialized = true metricsLogger.Info("Metrics initialized") } diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index bc08d559a9f..8fd1eaa5cdf 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -38,6 +38,8 @@ const ( nodeApiServerHandshakeLatencyDesc = "Histogram depicting latency of the TCP handshake between nodes and Kubernetes API server measured in milliseconds" dnsRequestCounterDescription = "DNS requests by statistics" dnsResponseCounterDescription = "DNS responses by statistics" + infinibandCounterStatsDescription = "InfiniBand Counter Statistics" + infinibandStatusParamsDescription = "InfiniBand Status Parameters" // Control plane metrics pluginManagerFailedToReconcileCounterDescription = "Number of times the plugin manager failed to reconcile the plugins" @@ -86,6 +88,9 @@ var ( // DNS Metrics. 
DNSRequestCounter ICounterVec DNSResponseCounter ICounterVec + + InfinibandCounterStats IGaugeVec + InfinibandStatusParams IGaugeVec ) func ToPrometheusType(metric interface{}) prometheus.Collector { diff --git a/pkg/plugin/infiniband/Makefile b/pkg/plugin/infiniband/Makefile new file mode 100644 index 00000000000..134f59a69c1 --- /dev/null +++ b/pkg/plugin/infiniband/Makefile @@ -0,0 +1,11 @@ +REPO_ROOT = $(shell git rev-parse --show-toplevel) +TOOLS_BIN_DIR = $(REPO_ROOT)/hack/tools/bin +MOCKGEN = $(TOOLS_BIN_DIR)/mockgen + +.PHONY: generate + +generate: $(MOCKGEN) ## Generate mock clients + $(MOCKGEN) -source=$(REPO_ROOT)/pkg/plugin/infiniband/types_linux.go -copyright_file=$(REPO_ROOT)/pkg/lib/ignore_headers.txt -package=infiniband > infiniband_mock_generated.go + +$(MOCKGEN): + @make -C $(REPO_ROOT) $(MOCKGEN) diff --git a/pkg/plugin/infiniband/infiniband_linux.go b/pkg/plugin/infiniband/infiniband_linux.go new file mode 100644 index 00000000000..d1a09b93390 --- /dev/null +++ b/pkg/plugin/infiniband/infiniband_linux.go @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Package infiniband contains the Retina infiniband plugin. It gathers infiniband statistics and debug status parameters. +package infiniband + +import ( + "context" + "sync" + "time" + + hubblev1 "github.com/cilium/cilium/pkg/hubble/api/v1" + kcfg "github.com/microsoft/retina/pkg/config" + "github.com/microsoft/retina/pkg/log" + "github.com/microsoft/retina/pkg/plugin/api" + "go.uber.org/zap" +) + +// New builds an infiniband plugin that uses cfg and a logger named after the plugin.
+func New(cfg *kcfg.Config) api.Plugin { + plugin := &infiniband{ + l: log.Logger().Named(string(Name)), + cfg: cfg, + } + return plugin +} + +// Name reports the plugin name. +func (ib *infiniband) Name() string { + return string(Name) +} + +// Generate is a no-op for this plugin and always returns nil. +func (ib *infiniband) Generate(ctx context.Context) error { + return nil +} + +// Compile is a no-op for this plugin and always returns nil. +func (ib *infiniband) Compile(ctx context.Context) error { + return nil +} + +// Init logs that the plugin is initializing and returns nil. +func (ib *infiniband) Init() error { + ib.l.Info("Initializing infiniband plugin...") + return nil +} + +// Start marks the plugin as running and blocks in run until ctx is done. +func (ib *infiniband) Start(ctx context.Context) error { + ib.isRunning = true + return ib.run(ctx) +} + +// SetupChannel is not supported: it logs a warning, ignores ch and returns nil. +func (ib *infiniband) SetupChannel(ch chan *hubblev1.Event) error { + ib.l.Warn("Plugin does not support SetupChannel", zap.String("plugin", string(Name))) + return nil +} + +// run calls readAndUpdate on a fresh reader at every cfg.MetricsInterval tick until ctx is done. +func (ib *infiniband) run(ctx context.Context) error { + ib.l.Info("Running infiniband plugin...") + tick := time.NewTicker(ib.cfg.MetricsInterval) + defer tick.Stop() + + // collect performs one read-and-update pass with a fresh reader and waits for it to finish. + collect := func() { + var pending sync.WaitGroup + reader := NewInfinibandReader() + + pending.Add(1) + go func() { + defer pending.Done() + if err := reader.readAndUpdate(); err != nil { + ib.l.Error("Reading infiniband stats failed", zap.Error(err)) + } + }() + pending.Wait() + } + + for { + select { + case <-tick.C: + collect() + case <-ctx.Done(): + ib.l.Info("Context is done, infiniband will stop running") + return nil + } + } +} + +// Stop marks the plugin as stopped; it does nothing when the plugin is not running. +func (ib *infiniband) Stop() error { + if ib.isRunning { + ib.l.Info("Stopping infiniband plugin...") + ib.isRunning = false + } + return nil +} diff --git a/pkg/plugin/infiniband/infiniband_linux_test.go b/pkg/plugin/infiniband/infiniband_linux_test.go new file mode 100644 index 00000000000..e22b94b29c3 --- /dev/null +++ b/pkg/plugin/infiniband/infiniband_linux_test.go @@ -0,0 +1,77 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license.
+ +//go:build unit +// +build unit + +package infiniband + +import ( + "context" + "testing" + "time" + + kcfg "github.com/microsoft/retina/pkg/config" + + "github.com/microsoft/retina/pkg/log" + "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" +) + +// Plugin configurations with pod-level mode enabled and disabled. +var ( + cfgPodLevelEnabled = &kcfg.Config{ + MetricsInterval: 1 * time.Second, + EnablePodLevel: true, + } + cfgPodLevelDisabled = &kcfg.Config{ + MetricsInterval: 1 * time.Second, + EnablePodLevel: false, + } +) + +// TestStop checks that Stop returns nil and leaves isRunning false, both before and after the plugin is marked running. +func TestStop(t *testing.T) { + log.SetupZapLogger(log.GetDefaultLogOpts()) + plugin := &infiniband{ + cfg: cfgPodLevelEnabled, + l: log.Logger().Named(string(Name)), + } + if err := plugin.Stop(); err != nil { + t.Fatalf("Expected no error") + } + if plugin.isRunning { + t.Fatalf("Expected isRunning to be false") + } + + plugin.isRunning = true + if err := plugin.Stop(); err != nil { + t.Fatalf("Expected no error") + } + if plugin.isRunning { + t.Fatalf("Expected isRunning to be false") + } +} + +// TestShutdown checks that Start returns without error once its context is cancelled. +func TestShutdown(t *testing.T) { + log.SetupZapLogger(log.GetDefaultLogOpts()) + plugin := &infiniband{ + cfg: &kcfg.Config{ + MetricsInterval: 100 * time.Second, + EnablePodLevel: true, + }, + l: log.Logger().Named(string(Name)), + } + + ctx, cancel := context.WithCancel(context.Background()) + group, groupCtx := errgroup.WithContext(ctx) + + group.Go(func() error { + return plugin.Start(groupCtx) + }) + + time.Sleep(1 * time.Second) + cancel() + require.NoError(t, group.Wait()) +} diff --git a/pkg/plugin/infiniband/infiniband_mock_generated.go b/pkg/plugin/infiniband/infiniband_mock_generated.go new file mode 100644 index 00000000000..ff77414df08 --- /dev/null +++ b/pkg/plugin/infiniband/infiniband_mock_generated.go @@ -0,0 +1,120 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: types_linux.go +// +// Generated by this command: +// +// mockgen -source=types_linux.go -destination=infiniband_mock_generated.go -package=infiniband +// + +// Package infiniband is a generated GoMock package.
+package infiniband + +import ( + reflect "reflect" + + netstat "github.com/cakturk/go-netstat/netstat" + gomock "go.uber.org/mock/gomock" +) + +// MockEthtoolInterface is a mock of EthtoolInterface interface. NOTE(review): the infiniband plugin code in this change reads sysfs files and does not reference an ethtool interface - confirm this mock is needed in this package. +type MockEthtoolInterface struct { + ctrl *gomock.Controller + recorder *MockEthtoolInterfaceMockRecorder +} + +// MockEthtoolInterfaceMockRecorder is the mock recorder for MockEthtoolInterface. +type MockEthtoolInterfaceMockRecorder struct { + mock *MockEthtoolInterface +} + +// NewMockEthtoolInterface creates a new mock instance. +func NewMockEthtoolInterface(ctrl *gomock.Controller) *MockEthtoolInterface { + mock := &MockEthtoolInterface{ctrl: ctrl} + mock.recorder = &MockEthtoolInterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockEthtoolInterface) EXPECT() *MockEthtoolInterfaceMockRecorder { + return m.recorder +} + +// Close mocks base method. +func (m *MockEthtoolInterface) Close() { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Close") +} + +// Close indicates an expected call of Close. +func (mr *MockEthtoolInterfaceMockRecorder) Close() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Close", reflect.TypeOf((*MockEthtoolInterface)(nil).Close)) +} + +// Stats mocks base method. +func (m *MockEthtoolInterface) Stats(intf string) (map[string]uint64, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Stats", intf) + ret0, _ := ret[0].(map[string]uint64) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Stats indicates an expected call of Stats. +func (mr *MockEthtoolInterfaceMockRecorder) Stats(intf any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stats", reflect.TypeOf((*MockEthtoolInterface)(nil).Stats), intf) +} + +// MockNetstatInterface is a mock of NetstatInterface interface. NOTE(review): the infiniband plugin code in this change does not reference a netstat interface either - confirm this mock is needed in this package.
+type MockNetstatInterface struct { + ctrl *gomock.Controller + recorder *MockNetstatInterfaceMockRecorder +} + +// MockNetstatInterfaceMockRecorder is the mock recorder for MockNetstatInterface. +type MockNetstatInterfaceMockRecorder struct { + mock *MockNetstatInterface +} + +// NewMockNetstatInterface creates a new mock instance. +func NewMockNetstatInterface(ctrl *gomock.Controller) *MockNetstatInterface { + mock := &MockNetstatInterface{ctrl: ctrl} + mock.recorder = &MockNetstatInterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockNetstatInterface) EXPECT() *MockNetstatInterfaceMockRecorder { + return m.recorder +} + +// TCPSocks mocks base method. +func (m *MockNetstatInterface) TCPSocks(accept netstat.AcceptFn) ([]netstat.SockTabEntry, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "TCPSocks", accept) + ret0, _ := ret[0].([]netstat.SockTabEntry) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// TCPSocks indicates an expected call of TCPSocks. +func (mr *MockNetstatInterfaceMockRecorder) TCPSocks(accept any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TCPSocks", reflect.TypeOf((*MockNetstatInterface)(nil).TCPSocks), accept) +} + +// UDPSocks mocks base method. +func (m *MockNetstatInterface) UDPSocks(accept netstat.AcceptFn) ([]netstat.SockTabEntry, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UDPSocks", accept) + ret0, _ := ret[0].([]netstat.SockTabEntry) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// UDPSocks indicates an expected call of UDPSocks.
+func (mr *MockNetstatInterfaceMockRecorder) UDPSocks(accept any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UDPSocks", reflect.TypeOf((*MockNetstatInterface)(nil).UDPSocks), accept) +} diff --git a/pkg/plugin/infiniband/infiniband_stats_linux.go b/pkg/plugin/infiniband/infiniband_stats_linux.go new file mode 100644 index 00000000000..9f175674ea1 --- /dev/null +++ b/pkg/plugin/infiniband/infiniband_stats_linux.go @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +package infiniband + +import ( + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/microsoft/retina/pkg/log" + "github.com/microsoft/retina/pkg/metrics" + "go.uber.org/zap" +) + +const ( + pathInfiniband = "/sys/class/infiniband" + pathDebugStatusParameters = "/sys/class/net" +) + +// NewInfinibandReader returns an InfinibandReader with empty stat maps and a logger named "InfinibandReader". +func NewInfinibandReader() *InfinibandReader { + return &InfinibandReader{ + l: log.Logger().Named("InfinibandReader"), + counterStats: make(map[CounterStat]uint64), + statusParamStats: make(map[StatusParam]uint64), + } +} + +// InfinibandReader holds the most recently read Infiniband port counters and debug status parameters. +type InfinibandReader struct { + l *log.ZapLogger + counterStats map[CounterStat]uint64 + statusParamStats map[StatusParam]uint64 +} + +// readAndUpdate reads the port counters and the status parameters from their default paths and, +// only when both reads succeed, publishes them through updateMetrics. The first read error is returned. +func (ir *InfinibandReader) readAndUpdate() error { + if err := ir.readCounterStats(pathInfiniband); err != nil { + return err + } + + if err := ir.readStatusParamStats(pathDebugStatusParameters); err != nil { + return err + } + + ir.updateMetrics() + ir.l.Debug("Done reading and updating stats") + + return nil +} + +// readCounterStats rebuilds ir.counterStats from the files found under path/DEVICE/ports/PORT/counters. +// A ports or counters directory, or a counter file, that cannot be read is logged and skipped. +// Failing to list path itself, or a counter value that does not parse as an unsigned integer, returns the error. +func (ir *InfinibandReader) readCounterStats(path string) error { + ir.counterStats = make(map[CounterStat]uint64) + devices, err := os.ReadDir(path) + if err != nil { + ir.l.Error("error reading dir:", zap.Error(err)) + return err + } + for _, device := range devices { + portsPath := filepath.Join(path, device.Name(), "ports") + ports, err := os.ReadDir(portsPath) + if err != nil { + ir.l.Error("error reading dir:", zap.Error(err)) +
continue + } + for _, port := range ports { + countersPath := filepath.Join(portsPath, port.Name(), "counters") + counters, err := os.ReadDir(countersPath) + if err != nil { + ir.l.Error("error reading dir:", zap.Error(err)) + continue + } + for _, counter := range counters { + counterPath := filepath.Join(countersPath, counter.Name()) + val, err := os.ReadFile(counterPath) + if err != nil { + ir.l.Error("Error while reading infiniband file: \n", zap.Error(err)) + continue + } + num, err := strconv.ParseUint(strings.TrimSpace(string(val)), 10, 64) + if err != nil { + ir.l.Error("error parsing string:", zap.Error(err)) + return err + } + ir.counterStats[CounterStat{Name: counter.Name(), Device: device.Name(), Port: port.Name()}] = num + } + + } + } + return nil +} + +// readStatusParamStats rebuilds ir.statusParamStats from the files found under path/IFACE/debug. +// An interface whose debug directory cannot be listed, or a parameter file that cannot be read, +// is logged and skipped. Failing to list path itself, or a value that does not parse as an +// unsigned integer, returns the error. +func (ir *InfinibandReader) readStatusParamStats(path string) error { + ifaces, err := os.ReadDir(path) + if err != nil { + ir.l.Error("error reading dir:", zap.Error(err)) + return err + } + ir.statusParamStats = make(map[StatusParam]uint64) + for _, iface := range ifaces { + statusParamsPath := filepath.Join(path, iface.Name(), "debug") + statusParams, err := os.ReadDir(statusParamsPath) + if err != nil { + ir.l.Error("error reading dir:", zap.Error(err)) + continue + } + for _, statusParam := range statusParams { + statusParamPath := filepath.Join(statusParamsPath, statusParam.Name()) + val, err := os.ReadFile(statusParamPath) + if err != nil { + ir.l.Error("Error while reading infiniband path file: \n", zap.Error(err)) + continue + } + // Trim surrounding whitespace before parsing, as readCounterStats does: + // a trailing newline would otherwise make ParseUint fail and abort the scan. + num, err := strconv.ParseUint(strings.TrimSpace(string(val)), 10, 64) + if err != nil { + ir.l.Error("error parsing string:", zap.Error(err)) + return err + } + ir.statusParamStats[StatusParam{Name: statusParam.Name(), Iface: iface.Name()}] = num + + } + } + return nil +} + +func (ir *InfinibandReader) updateMetrics() { + if ir.counterStats == nil { + ir.l.Info("No stats found") + return + } + if ir.statusParamStats == nil { + ir.l.Info("No status param stats
found") + return + } + + // Adding counter stats + for counter, val := range ir.counterStats { + metrics.InfinibandCounterStats.WithLabelValues(counter.Name, counter.Device, counter.Port).Set(float64(val)) + } + + // Adding status params + for statusParam, val := range ir.statusParamStats { + metrics.InfinibandStatusParams.WithLabelValues(statusParam.Name, statusParam.Iface).Set(float64(val)) + } +} diff --git a/pkg/plugin/infiniband/infiniband_stats_linux_test.go b/pkg/plugin/infiniband/infiniband_stats_linux_test.go new file mode 100644 index 00000000000..32d8386c0a5 --- /dev/null +++ b/pkg/plugin/infiniband/infiniband_stats_linux_test.go @@ -0,0 +1,134 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +package infiniband + +import ( + "testing" + + "github.com/microsoft/retina/pkg/log" + "github.com/microsoft/retina/pkg/metrics" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + gomock "go.uber.org/mock/gomock" +) + +var ( + MockGaugeVec *metrics.MockIGaugeVec + MockCounterVec *metrics.MockICounterVec +) + +func TestNewInfinibandReader(t *testing.T) { + log.SetupZapLogger(log.GetDefaultLogOpts()) + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + nr := NewInfinibandReader() + assert.NotNil(t, nr) +} + +func InitalizeMetricsForTesting(ctrl *gomock.Controller) { + metricsLogger := log.Logger().Named("metrics") + metricsLogger.Info("Initializing metrics for testing") + + MockGaugeVec = metrics.NewMockIGaugeVec(ctrl) + metrics.InfinibandCounterStats = MockGaugeVec //nolint:typecheck + metrics.InfinibandStatusParams = MockGaugeVec +} + +//nolint:testifylint // not making linter changes to preserve exact behavior +func TestReadCounterStats(t *testing.T) { + log.SetupZapLogger(log.GetDefaultLogOpts()) + tests := []struct { + name string + filePath string + result *CounterStat + wantErr bool + }{ + { + name: "test correct", + filePath: "testdata/infiniband", + wantErr: false, + }, + } + + 
for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + nr := NewInfinibandReader() + InitalizeMetricsForTesting(ctrl) + + testmetric := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "testmetric", + Help: "testmetric", + }) + + MockGaugeVec.EXPECT().WithLabelValues(gomock.Any()).Return(testmetric).AnyTimes() + + assert.NotNil(t, nr) + err := nr.readCounterStats(tt.filePath) + if tt.wantErr { + assert.NotNil(t, err, "Expected error but got nil") + } else { + assert.Nil(t, err, "Expected nil but got err") + assert.NotNil(t, nr.counterStats, "Expected data got nil") + for _, val := range nr.counterStats { + assert.Equal(t, val, uint64(1)) + } + assert.Equal(t, len(nr.counterStats), 6, "Read values are not equal to expected") + nr.updateMetrics() + } + }) + } +} + +func TestReadStatusParamStats(t *testing.T) { + log.SetupZapLogger(log.GetDefaultLogOpts()) + tests := []struct { + name string + filePath string + result *StatusParam + wantErr bool + }{ + { + name: "test correct", + filePath: "testdata/net", + wantErr: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + nr := NewInfinibandReader() + InitalizeMetricsForTesting(ctrl) + + testmetric := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "testmetric", + Help: "testmetric", + }) + + MockGaugeVec.EXPECT().WithLabelValues(gomock.Any()).Return(testmetric).AnyTimes() + + assert.NotNil(t, nr) + err := nr.readStatusParamStats(tt.filePath) + if tt.wantErr { + assert.NotNil(t, err, "Expected error but got nil") + } else { + assert.Nil(t, err, "Expected nil but got err") + assert.NotNil(t, nr.statusParamStats, "Expected data got nil") + for _, val := range nr.statusParamStats { + assert.Equal(t, val, uint64(1)) + } + assert.Equal(t, len(nr.statusParamStats), 4, "Read values are not equal to expected") + + nr.updateMetrics() + } + }) + } 
+} diff --git a/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/1/counters/VL15_dropped b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/1/counters/VL15_dropped new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/1/counters/VL15_dropped @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/1/counters/excessive_buffer_overrun_errors b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/1/counters/excessive_buffer_overrun_errors new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/1/counters/excessive_buffer_overrun_errors @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/2/counters/VL15_dropped b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/2/counters/VL15_dropped new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/2/counters/VL15_dropped @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/2/counters/excessive_buffer_overrun_errors b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/2/counters/excessive_buffer_overrun_errors new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/infiniband/mlx5_an0/ports/2/counters/excessive_buffer_overrun_errors @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/infiniband/mlx5_ib0/ports/1/counters/VL15_dropped b/pkg/plugin/infiniband/testdata/infiniband/mlx5_ib0/ports/1/counters/VL15_dropped new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/infiniband/mlx5_ib0/ports/1/counters/VL15_dropped @@ -0,0 +1 @@ +1 \ No newline at end of file diff 
--git a/pkg/plugin/infiniband/testdata/infiniband/mlx5_ib0/ports/1/counters/excessive_buffer_overrun_errors b/pkg/plugin/infiniband/testdata/infiniband/mlx5_ib0/ports/1/counters/excessive_buffer_overrun_errors new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/infiniband/mlx5_ib0/ports/1/counters/excessive_buffer_overrun_errors @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/net/docker0/debug/link_down_reason b/pkg/plugin/infiniband/testdata/net/docker0/debug/link_down_reason new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/net/docker0/debug/link_down_reason @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/net/docker0/debug/lro_timeout b/pkg/plugin/infiniband/testdata/net/docker0/debug/lro_timeout new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/net/docker0/debug/lro_timeout @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/net/ib0/debug/link_down_reason b/pkg/plugin/infiniband/testdata/net/ib0/debug/link_down_reason new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/net/ib0/debug/link_down_reason @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/testdata/net/ib0/debug/lro_timeout b/pkg/plugin/infiniband/testdata/net/ib0/debug/lro_timeout new file mode 100644 index 00000000000..56a6051ca2b --- /dev/null +++ b/pkg/plugin/infiniband/testdata/net/ib0/debug/lro_timeout @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/pkg/plugin/infiniband/types_linux.go b/pkg/plugin/infiniband/types_linux.go new file mode 100644 index 00000000000..f71caceee0a --- /dev/null +++ b/pkg/plugin/infiniband/types_linux.go @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+package infiniband + +import ( + "github.com/cakturk/go-netstat/netstat" + kcfg "github.com/microsoft/retina/pkg/config" + "github.com/microsoft/retina/pkg/log" + "github.com/microsoft/retina/pkg/plugin/api" +) + +const ( + Name api.PluginName = "infiniband" +) + +//go:generate go run go.uber.org/mock/mockgen@v0.4.0 -source=types_linux.go -destination=infiniband_mock_generated.go -package=infiniband +type infiniband struct { + cfg *kcfg.Config + l *log.ZapLogger + isRunning bool +} + +var netstatCuratedKeys = map[string]struct{}{ + "ListenDrops": {}, + "LockDroppedIcmps": {}, + "PFMemallocDrop": {}, + "TCPBacklogDrop": {}, + "TCPDeferAcceptDrop": {}, + "TCPMinTTLDrop": {}, + "TCPRcvQDrop": {}, + "TCPReqQFullDrop": {}, + "TCPZeroWindowDrop": {}, + "InCsumErrors": {}, + "DataCsumErr": {}, + "AddAddrDrop": {}, + "RmAddrDrop": {}, +} + +type ConnectionStats struct { + // https://github.com/ecki/net-tools/blob/master/statistics.c#L206 + TcpExt map[string]uint64 `json:"tcp_ext"` + IpExt map[string]uint64 `json:"ip_ext"` + MPTcpExt map[string]uint64 `json:"mptcp_ext"` + // Socket stats + UdpSockets SocketStats `json:"udp_sockets"` + TcpSockets SocketStats `json:"tcp_sockets"` +} + +type IfaceStats struct { + Name string + // Inbound stats + RxBytes uint64 + RxPackets uint64 + RxErrs uint64 + RxDrop uint64 + RxFIFO uint64 + RxFrame uint64 + RxCompressed uint64 + RxMulticast uint64 + // Outbound stats + TxBytes uint64 + TxPackets uint64 + TxErrs uint64 + TxDrop uint64 + TxFIFO uint64 + TxColls uint64 + TxCarrier uint64 + TxCompressed uint64 +} + +type CounterStat struct { + Name string + Device string + Port string +} + +type StatusParam struct { + Name string + Iface string +} + +type SocketStats struct { + totalActiveSockets int + // count of sockets opened by state + socketByState map[string]int + // count of sockets opened by remote address + socketByRemoteAddr map[string]int +} + +type NetstatOpts struct { + // when true only includes curated list of keys + 
CuratedKeys bool + + // when true will include all keys with value 0 + AddZeroVal bool + + // get only listening sockets + ListenSock bool +} + +type EthtoolStats struct { + // Stats by interface name and stat name + stats map[string]map[string]uint64 +} + +type EthtoolOpts struct { + // when true will only include keys with err or drop in its name + errOrDropKeysOnly bool + + // when true will include all keys with value 0 + addZeroVal bool +} + +type EthtoolInterface interface { + Stats(intf string) (map[string]uint64, error) + Close() +} + +type NetstatInterface interface { + UDPSocks(accept netstat.AcceptFn) ([]netstat.SockTabEntry, error) + TCPSocks(accept netstat.AcceptFn) ([]netstat.SockTabEntry, error) +} + +type Netstat struct{} + +func (n *Netstat) UDPSocks(accept netstat.AcceptFn) ([]netstat.SockTabEntry, error) { + return netstat.UDPSocks(accept) +} + +func (n *Netstat) TCPSocks(accept netstat.AcceptFn) ([]netstat.SockTabEntry, error) { + return netstat.TCPSocks(accept) +} diff --git a/pkg/plugin/packetparser/packetparser_bpfel_arm64.o b/pkg/plugin/packetparser/packetparser_bpfel_arm64.o deleted file mode 100644 index 0a8a67e52d0..00000000000 Binary files a/pkg/plugin/packetparser/packetparser_bpfel_arm64.o and /dev/null differ diff --git a/pkg/plugin/packetparser/packetparser_bpfel_x86.o b/pkg/plugin/packetparser/packetparser_bpfel_x86.o deleted file mode 100644 index d57b8375704..00000000000 Binary files a/pkg/plugin/packetparser/packetparser_bpfel_x86.o and /dev/null differ diff --git a/pkg/plugin/registry/registry_linux.go b/pkg/plugin/registry/registry_linux.go index 6918a6fe353..cbe7ed5f33e 100644 --- a/pkg/plugin/registry/registry_linux.go +++ b/pkg/plugin/registry/registry_linux.go @@ -11,6 +11,7 @@ import ( "github.com/microsoft/retina/pkg/plugin/dns" "github.com/microsoft/retina/pkg/plugin/dropreason" "github.com/microsoft/retina/pkg/plugin/linuxutil" + "github.com/microsoft/retina/pkg/plugin/infiniband" 
"github.com/microsoft/retina/pkg/plugin/mockplugin" "github.com/microsoft/retina/pkg/plugin/packetforward" "github.com/microsoft/retina/pkg/plugin/packetparser" @@ -26,6 +27,7 @@ func RegisterPlugins() { PluginHandler[dropreason.Name] = dropreason.New PluginHandler[packetforward.Name] = packetforward.New PluginHandler[linuxutil.Name] = linuxutil.New + PluginHandler[infiniband.Name] = infiniband.New PluginHandler[packetparser.Name] = packetparser.New PluginHandler[dns.Name] = dns.New PluginHandler[tcpretrans.Name] = tcpretrans.New diff --git a/pkg/utils/attr_utils.go b/pkg/utils/attr_utils.go index c0c47e95743..4776d42e76a 100644 --- a/pkg/utils/attr_utils.go +++ b/pkg/utils/attr_utils.go @@ -48,6 +48,7 @@ var ( Endpoint = "endpoint" AclRule = "aclrule" Active = "ACTIVE" + Device = "device" // TCP Connection Statistic Names ResetCount = "ResetCount" diff --git a/pkg/utils/metric_names.go b/pkg/utils/metric_names.go index b672dd20872..3db7daaa3e9 100644 --- a/pkg/utils/metric_names.go +++ b/pkg/utils/metric_names.go @@ -29,6 +29,8 @@ const ( NodeApiServerLatencyName = "node_apiserver_latency" NodeApiServerTcpHandshakeLatencyName = "node_apiserver_handshake_latency" NoResponseFromApiServerName = "node_apiserver_no_response" + InfinibandCounterStatsName = "infiniband_counter_stats" + InfinibandStatusParamsName = "infiniband_status_params" // Common Gauges across os distributions NodeConnectivityStatusName = "node_connectivity_status" diff --git a/plugin.sh b/plugin.sh new file mode 100644 index 00000000000..c9446af326a --- /dev/null +++ b/plugin.sh @@ -0,0 +1,9 @@ +CNI_PLUGIN_VERSION="v1.4.0" +CNI_PLUGIN_TAR="cni-plugins-linux-amd64-$CNI_PLUGIN_VERSION.tgz" # change arch if not on amd64 +CNI_PLUGIN_INSTALL_DIR="/opt/cni/bin" + +#https://github.com/containernetworking/plugins/releases/download/v1.4.0/cni-plugins-linux-amd64-v1.4.0.tgz +curl -LO "https://github.com/containernetworking/plugins/releases/download/$CNI_PLUGIN_VERSION/$CNI_PLUGIN_TAR" +sudo mkdir -p 
"$CNI_PLUGIN_INSTALL_DIR" +sudo tar -xf "$CNI_PLUGIN_TAR" -C "$CNI_PLUGIN_INSTALL_DIR" +rm "$CNI_PLUGIN_TAR" diff --git a/test-summary b/test-summary new file mode 100755 index 00000000000..a048cba9523 Binary files /dev/null and b/test-summary differ diff --git a/test/plugin/infiniband/main_linux.go b/test/plugin/infiniband/main_linux.go new file mode 100644 index 00000000000..4b307a87ddb --- /dev/null +++ b/test/plugin/infiniband/main_linux.go @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +package main + +import ( + "context" + "time" + + kcfg "github.com/microsoft/retina/pkg/config" + + "github.com/microsoft/retina/pkg/log" + "github.com/microsoft/retina/pkg/metrics" + "github.com/microsoft/retina/pkg/plugin/infiniband" + + "go.uber.org/zap" +) + +// main is a manual test harness: it sets up logging and metrics, initialises and starts the +// infiniband plugin with a one-second metrics interval, and then blocks until the context is done. +func main() { + log.SetupZapLogger(log.GetDefaultLogOpts()) + l := log.Logger().Named("test-infiniband") + + metrics.InitializeMetrics() + + cfg := &kcfg.Config{ + MetricsInterval: 1 * time.Second, + EnablePodLevel: true, + } + tt := infiniband.New(cfg) + err := tt.Init() + if err != nil { + l.Error("Init failed", zap.Error(err)) + return + } + ctx := context.Background() + err = tt.Start(ctx) + if err != nil { + l.Error("start failed", zap.Error(err)) + return + } + l.Info("started infiniband logger") + + defer func() { + err := tt.Stop() + if err != nil { + l.Error("stop failed", zap.Error(err)) + } + }() + + // context.Background is never cancelled, so this blocks until the process is killed. + <-ctx.Done() +}