Skip to content

Commit

Permalink
Integrate single IB network with MNNVL topology
Browse files Browse the repository at this point in the history
Signed-off-by: Ritika Srivastava <[email protected]>
  • Loading branch information
ritikasrivastava authored and dmitsh committed Oct 22, 2024
1 parent 7113f00 commit 9bd0e53
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 26 deletions.
44 changes: 33 additions & 11 deletions pkg/providers/baremetal/mnnvl.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ import (
"bufio"
"context"
"fmt"
"strconv"
"strings"

"github.com/NVIDIA/topograph/pkg/common"
"github.com/NVIDIA/topograph/pkg/ib"
"github.com/NVIDIA/topograph/pkg/utils"
)

Expand Down Expand Up @@ -36,6 +36,20 @@ func domainIDExists(id string, domainMap map[string]domain) bool {
return false
}

// getIbOutput runs "sudo ibnetdiscover" through pdsh on the given nodes, one
// node at a time, and returns the raw output of the first node whose output
// contains the "Topology file:" marker.
//
// Nodes whose output lacks the marker are skipped. The first pdsh execution
// failure aborts the search and is returned wrapped, so callers can still
// inspect the underlying cause with errors.Is / errors.As. An error is also
// returned when no node (including an empty node list) produced the marker.
func getIbOutput(ctx context.Context, nodes []string) ([]byte, error) {
	for _, node := range nodes {
		args := []string{"-N", "-R", "ssh", "-w", node, "sudo ibnetdiscover"}
		stdout, err := utils.Exec(ctx, "pdsh", args, nil)
		if err != nil {
			// Keep the node name and the original error; the bare message
			// alone gives no hint about which host failed or why.
			return nil, fmt.Errorf("pdsh ibnetdiscover on node %q: %w", node, err)
		}
		if strings.Contains(stdout.String(), "Topology file:") {
			return stdout.Bytes(), nil
		}
	}
	return nil, fmt.Errorf("no IB network found on any of %d node(s)", len(nodes))
}

// getClusterOutput reads output from nodeInfo and populates the structs
func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes []string, cmd string) error {
args := []string{"-R", "ssh", "-w", strings.Join(nodes, ","), cmd}
Expand Down Expand Up @@ -63,31 +77,30 @@ func getClusterOutput(ctx context.Context, domainMap map[string]domain, nodes []
}
return nil
}
func toGraph(domainMap map[string]domain) *common.Vertex {
func toGraph(domainMap map[string]domain, treeRoot *common.Vertex) *common.Vertex {
root := &common.Vertex{
Vertices: make(map[string]*common.Vertex),
Metadata: make(map[string]string),
}
blockSize := -1
blockRoot := &common.Vertex{
Vertices: make(map[string]*common.Vertex),
Metadata: make(map[string]string),
}
root.Vertices[common.ValTopologyTree] = treeRoot
for domainName, domain := range domainMap {
tree := &common.Vertex{
ID: domainName,
Vertices: make(map[string]*common.Vertex),
}
for node := range domain.nodeMap {
tree.Vertices[node] = &common.Vertex{Name: node, ID: node}
if blockSize == -1 {
blockSize = len(domain.nodeMap)
} else {
fmt.Printf("blockSize different between NVL domains")
}
}
root.Vertices[domainName] = tree
blockRoot.Vertices[domainName] = tree
}
// add root metadata
root.Metadata[common.KeyEngine] = common.EngineSLURM
root.Metadata[common.KeyPlugin] = common.ValTopologyBlock
root.Metadata[common.KeyBlockSizes] = strconv.Itoa(blockSize)
root.Vertices[common.ValTopologyBlock] = blockRoot
return root
}

Expand All @@ -98,5 +111,14 @@ func generateTopologyConfig(ctx context.Context, cis []common.ComputeInstances)
if err != nil {
return nil, fmt.Errorf("getClusterOutput failed: %v\n", err)
}
return toGraph(domainMap), nil
// get ibnetdiscover output from 1st node
ibnetdiscoverOutput, err := getIbOutput(ctx, nodes)
if err != nil {
return nil, fmt.Errorf("getIbOutput failed: %v\n", err)
}
treeRoot, err := ib.GenerateTopologyConfig(ibnetdiscoverOutput)
if err != nil {
return nil, fmt.Errorf("IB GenerateTopologyConfig failed: %v\n", err)
}
return toGraph(domainMap, treeRoot), nil
}
148 changes: 133 additions & 15 deletions pkg/translate/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,31 +28,92 @@ import (

// ToSLURM writes the SLURM topology configuration described by root to wr.
//
// When root's metadata selects the block plugin (common.KeyPlugin equals
// common.ValTopologyBlock) the block format is produced by toBlockSLURM;
// otherwise the tree format is produced by toTreeSLURM. The error of the
// selected writer is returned unchanged.
func ToSLURM(wr io.Writer, root *common.Vertex) error {
	if len(root.Metadata) != 0 && root.Metadata[common.KeyPlugin] == common.ValTopologyBlock {
		return toBlockSLURM(wr, root)
	}
	return toTreeSLURM(wr, root)
}

func toBlockSLURM(wr io.Writer, root *common.Vertex, blocksizes string) error {
// sort the IDs
keys := make([]string, 0, len(root.Vertices))
for key := range root.Vertices {
keys = append(keys, key)
}
sort.Strings(keys)

for _, key := range keys {
block := root.Vertices[key]
func printBlock(wr io.Writer, block *common.Vertex, domainVisited map[string]int) error {
if _, exists := domainVisited[block.ID]; !exists {
nodes := make([]string, 0, len(block.Vertices))
for _, node := range block.Vertices {
for _, node := range block.Vertices { //nodes within each domain
nodes = append(nodes, node.Name)
}
_, err := wr.Write([]byte(fmt.Sprintf("BlockName=%s Nodes=%s\n", block.ID, strings.Join(compress(nodes), ","))))
if err != nil {
return err
}
domainVisited[block.ID] = len(nodes)
}
return nil
}

// findBlock looks through the children of root (the block root) for the first
// block that has an entry keyed by nodename and prints that block via
// printBlock. It returns printBlock's result, or nil when no block matches.
//
// NOTE(review): the match is done on the map key of block.Vertices, not on the
// Name field of the stored vertices, so it only succeeds when blocks are keyed
// by node name — confirm against the code that builds the graph.
func findBlock(wr io.Writer, nodename string, root *common.Vertex, domainVisited map[string]int) error {
	for _, candidate := range root.Vertices {
		_, found := candidate.Vertices[nodename]
		if !found {
			continue
		}
		return printBlock(wr, candidate, domainVisited)
	}
	return nil
}

// printDisconnectedBlocks prints every block under root that has not been
// printed yet (printBlock skips blocks already recorded in domainVisited).
//
// Blocks are visited in ascending key order so that the generated
// configuration is stable from run to run; ranging over the map directly
// would emit them in a random order. A nil root is treated as "no blocks".
// The first write error stops the iteration and is returned.
func printDisconnectedBlocks(wr io.Writer, root *common.Vertex, domainVisited map[string]int) error {
	if root == nil {
		return nil
	}

	keys := make([]string, 0, len(root.Vertices))
	for key := range root.Vertices {
		keys = append(keys, key)
	}
	sort.Strings(keys)

	for _, key := range keys {
		if err := printBlock(wr, root.Vertices[key], domainVisited); err != nil {
			return err
		}
	}
	return nil
}

// verifyBlockSize derives a single block size from domainVisited, which maps
// a block ID to the number of nodes printed for that block.
//
// It returns -1 when the map is empty. When all blocks have the same size
// that size is returned. When sizes differ, one alert line is printed to
// standard output and the smallest size is returned, so the result does not
// depend on map iteration order.
func verifyBlockSize(domainVisited map[string]int) int {
	blockSize := -1
	mismatch := false
	for _, size := range domainVisited {
		if blockSize == -1 {
			blockSize = size
			continue
		}
		if size != blockSize {
			mismatch = true
			if size < blockSize {
				blockSize = size
			}
		}
	}
	if mismatch {
		fmt.Printf("Alert! blockSize different between NVL domains\n")
	}
	return blockSize
}

// toBlockSLURM writes a SLURM block topology for root to wr.
//
// root is expected to carry the block topology under common.ValTopologyBlock
// and, optionally, a tree topology under common.ValTopologyTree. Output is
// produced in three steps:
//
//  1. If a tree is present it is walked breadth-first. For every leaf (a
//     vertex without children) the block containing that leaf's Name is
//     printed, so blocks appear in the order their nodes are met in the tree.
//     Children are visited in ascending key order to keep the output stable.
//  2. Blocks that were not reached through the tree are printed afterwards.
//  3. A final "BlockSizes=" line is written. The value is the size derived
//     from the printed blocks, unless root.Metadata contains
//     common.KeyBlockSizes, in which case that value is used instead.
//
// An error is returned when the block topology is missing from root or when
// writing to wr fails.
func toBlockSLURM(wr io.Writer, root *common.Vertex) error {
	treeRoot := root.Vertices[common.ValTopologyTree]
	blockRoot := root.Vertices[common.ValTopologyBlock]
	if blockRoot == nil {
		// Without this check the helpers below would dereference a nil vertex.
		return fmt.Errorf("missing %q vertex in block topology graph", common.ValTopologyBlock)
	}

	// domainVisited records which blocks were printed and how many nodes each has.
	domainVisited := make(map[string]int)

	if treeRoot != nil {
		visited := make(map[string]bool)
		queue := []*common.Vertex{treeRoot}
		for len(queue) > 0 {
			v := queue[0]
			queue = queue[1:]

			keys := make([]string, 0, len(v.Vertices))
			for key := range v.Vertices {
				keys = append(keys, key)
			}
			sort.Strings(keys)

			for _, key := range keys {
				w := v.Vertices[key]
				if len(w.Vertices) == 0 { // it's a leaf; don't add to queue
					if err := findBlock(wr, w.Name, blockRoot, domainVisited); err != nil {
						return err
					}
				} else if !visited[w.ID] {
					visited[w.ID] = true
					queue = append(queue, w)
				}
			}
		}
	}

	if err := printDisconnectedBlocks(wr, blockRoot, domainVisited); err != nil {
		return err
	}

	blockSize := strconv.Itoa(verifyBlockSize(domainVisited))
	if configured, exists := root.Metadata[common.KeyBlockSizes]; exists {
		// An explicitly configured value takes precedence over the derived one.
		blockSize = configured
	}
	_, err := fmt.Fprintf(wr, "BlockSizes=%s\n", blockSize)
	return err
}

Expand Down Expand Up @@ -284,7 +345,7 @@ func GetTreeTestSet(testForLongLabelName bool) (*common.Vertex, map[string]strin
return root, instance2node
}

func GetBlockTestSet() (*common.Vertex, map[string]string) {
func GetBlockWithIBTestSet() (*common.Vertex, map[string]string) {
instance2node := map[string]string{
"I14": "Node104", "I15": "Node105", "I16": "Node106",
"I21": "Node201", "I22": "Node202", "I25": "Node205",
Expand All @@ -298,6 +359,22 @@ func GetBlockTestSet() (*common.Vertex, map[string]string) {
n22 := &common.Vertex{ID: "I22", Name: "Node202"}
n25 := &common.Vertex{ID: "I25", Name: "Node205"}

sw2 := &common.Vertex{
ID: "S2",
Vertices: map[string]*common.Vertex{"I14": n14, "I15": n15, "I16": n16},
}
sw3 := &common.Vertex{
ID: "S3",
Vertices: map[string]*common.Vertex{"I21": n21, "I22": n22, "I25": n25},
}
sw1 := &common.Vertex{
ID: "S1",
Vertices: map[string]*common.Vertex{"S2": sw2, "S3": sw3},
}
treeRoot := &common.Vertex{
Vertices: map[string]*common.Vertex{"S1": sw1},
}

block1 := &common.Vertex{
ID: "B1",
Vertices: map[string]*common.Vertex{"I14": n14, "I15": n15, "I16": n16},
Expand All @@ -307,14 +384,55 @@ func GetBlockTestSet() (*common.Vertex, map[string]string) {
Vertices: map[string]*common.Vertex{"I21": n21, "I22": n22, "I25": n25},
}

root := &common.Vertex{
blockRoot := &common.Vertex{
Vertices: map[string]*common.Vertex{"B1": block1, "B2": block2},
}

root := &common.Vertex{
Vertices: map[string]*common.Vertex{common.ValTopologyBlock: blockRoot, common.ValTopologyTree: treeRoot},
Metadata: map[string]string{
common.KeyEngine: common.EngineSLURM,
common.KeyPlugin: common.ValTopologyBlock,
common.KeyBlockSizes: "8",
},
}
return root, instance2node
}

// GetBlockTestSet returns a block-only topology fixture together with its
// instance-ID-to-node-name map.
//
// The graph has no tree topology: the root holds a single child under
// common.ValTopologyBlock with two blocks, "B1" and "B2", of three nodes each.
// Root metadata selects the SLURM engine and the block plugin, and sets the
// block size to "8".
func GetBlockTestSet() (*common.Vertex, map[string]string) {
	leaf := func(id, name string) *common.Vertex {
		return &common.Vertex{ID: id, Name: name}
	}

	firstBlock := &common.Vertex{
		ID: "B1",
		Vertices: map[string]*common.Vertex{
			"I14": leaf("I14", "Node104"),
			"I15": leaf("I15", "Node105"),
			"I16": leaf("I16", "Node106"),
		},
	}
	secondBlock := &common.Vertex{
		ID: "B2",
		Vertices: map[string]*common.Vertex{
			"I21": leaf("I21", "Node201"),
			"I22": leaf("I22", "Node202"),
			"I25": leaf("I25", "Node205"),
		},
	}

	blocks := &common.Vertex{
		Vertices: map[string]*common.Vertex{"B1": firstBlock, "B2": secondBlock},
	}

	graph := &common.Vertex{
		Vertices: map[string]*common.Vertex{common.ValTopologyBlock: blocks},
		Metadata: map[string]string{
			common.KeyEngine:     common.EngineSLURM,
			common.KeyPlugin:     common.ValTopologyBlock,
			common.KeyBlockSizes: "8",
		},
	}

	nodeNames := map[string]string{
		"I14": "Node104", "I15": "Node105", "I16": "Node106",
		"I21": "Node201", "I22": "Node202", "I25": "Node205",
	}
	return graph, nodeNames
}
8 changes: 8 additions & 0 deletions pkg/translate/output_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ func TestToBlockSLURM(t *testing.T) {
require.Equal(t, testBlockConfig, buf.String())
}

// TestToBlockIBSLURM renders the fixture that combines a tree topology with a
// block topology and expects the same text as the block-only configuration.
func TestToBlockIBSLURM(t *testing.T) {
	root, _ := GetBlockWithIBTestSet()

	var out bytes.Buffer
	require.NoError(t, ToSLURM(&out, root))
	require.Equal(t, testBlockConfig, out.String())
}

func TestToSlurmNameShortener(t *testing.T) {
v := &common.Vertex{
Vertices: map[string]*common.Vertex{
Expand Down

0 comments on commit 9bd0e53

Please sign in to comment.