Include the ib_burn scripts.

This commit is contained in:
Bartosz Wróblewski
2024-06-25 09:34:41 -07:00
parent c314f3330c
commit dcc1fff942
5 changed files with 348 additions and 0 deletions

View File

@@ -7,3 +7,4 @@ The code is organized as follows:
- `health_checks` contains various checks we use to determine which hosts are healthy, as well as automated solutions to common issues.
- `host_validation` contains tests to check that the GPUs on a given machine are able to communicate with each other (via NVLink) and with GPUs on other machines (via InfiniBand).
- `ufm_events` contains a script which parses the UFM event log and other logs, checks for relevant events, and determines which network ports should be disabled.
- `ib_burn` contains a script for generating a comprehensive burn-in workload for IB fabrics, aiming to exercise every available link.

29
ib_burn/config.json Normal file
View File

@@ -0,0 +1,29 @@
{
"node_info": {
"nodes": {
"hostname-23r66x3": "10.1.2.3",
"hostname-32ls6x3": "10.1.2.4",
"hostname-3lvy7y3": "10.1.2.5"
},
"user": "root",
"port": 22
},
"ib_hcas": {
"mlx5_0": "ibp26s0",
"mlx5_3": "ibp60s0",
"mlx5_4": "ibp77s0",
"mlx5_5": "ibp94s0",
"mlx5_6": "ibp156s0",
"mlx5_9": "ibp188s0",
"mlx5_10": "ibp204s0",
"mlx5_11": "ibp220s0"
},
"ib_burn": {
"min_link_throughput": 0.50,
"max_link_throughput": 1.00,
"parallel_stage_size": 1024,
"parallel_stage_time": 1800,
"server_startup_time": 15,
"writer_startup_time": 1
}
}

281
ib_burn/ib_burn.py Executable file
View File

@@ -0,0 +1,281 @@
#!/usr/bin/env python3
"""
Generates a comprehensive IB fabric burn-in workload (ib_burn.sh).
Gathers the necessary information from config.json and files generated by ./ib_fabric.sh (run it first!).
We assume that if multiple shortest paths exist between two switches, the traffic is evenly distributed among them.
That assumption holds for IB fabrics with AR (Adaptive Routing) enabled, but your mileage may vary.
This script can't be directly used with ethernet / RoCE fabrics, but the general logic should work.
You'll want to gather traffic counters before and after the burn-in to verify the results.
"""
import collections
import itertools
import json
import random

import networkx
import numpy

# Burn-in tuning knobs; see the "ib_burn" section of config.json.
with open("config.json") as _config_file:
    CONFIG = json.load(_config_file)
MIN_LINK_BW = CONFIG["ib_burn"]["min_link_throughput"]  # a link counts as burned once a stage drives it to this fraction
MAX_LINK_BW = CONFIG["ib_burn"]["max_link_throughput"]  # never plan more than this fraction of any link's capacity
STAGE_SIZE = CONFIG["ib_burn"]["parallel_stage_size"]  # soft cap on concurrent ib_write_bw pairs per stage
PARALLEL_STAGE_TIME = CONFIG["ib_burn"]["parallel_stage_time"]  # seconds each stage's transfers run
SERVER_STARTUP_TIME = CONFIG["ib_burn"]["server_startup_time"]  # seconds to wait for the servers to come up
WRITER_STARTUP_TIME = CONFIG["ib_burn"]["writer_startup_time"]  # seconds to wait after launching the writers
# Kill any straggling ib_write_bw a little after the stage should have finished.
STAGE_INNER_TIMEOUT = PARALLEL_STAGE_TIME + SERVER_STARTUP_TIME + WRITER_STARTUP_TIME + 10

# Topology tables, filled in below from the files written by ./ib_fabric.sh.
GUID_TO_HOST_IB_HCA = {}  # HCA guid -> (hostname, hca name)
GUID_TO_SWITCH_NAME = {}  # switch guid -> switch name
SWITCH_TO_SWITCH_LINKS = collections.Counter()  # (guid1, guid2) -> number of parallel physical links
LEAF_SWITCH_TO_HOST_IB = collections.defaultdict(list)  # leaf switch guid -> [(host, hca), ...]
# ib_server.list: one "<guid> <host> <hca>" line per host HCA.
with open("ib_server.list") as server_list:
    for line in server_list:
        fields = line.split()
        if not fields:
            continue  # ib_fabric.sh may leave a blank first line; skip it
        guid, host, hca = fields
        GUID_TO_HOST_IB_HCA[guid] = (host, hca)
# ib_switch.list: one "<guid> <name>" line per switch.
with open("ib_switch.list") as switch_list:
    for line in switch_list:
        guid, name = line.split()
        GUID_TO_SWITCH_NAME[guid] = name
# ib_switch.link: one "<guid1> <guid2> <port1> <port2>" line per fabric link.
with open("ib_switch.link") as switch_links:
    for line in switch_links:
        guid1, guid2, port1, port2 = line.split()
        # A link touching a host HCA tells us which leaf switch that HCA hangs off.
        if guid1 in GUID_TO_HOST_IB_HCA:
            host, hca = GUID_TO_HOST_IB_HCA[guid1]
            LEAF_SWITCH_TO_HOST_IB[guid2].append((host, hca))
        if guid2 in GUID_TO_HOST_IB_HCA:
            host, hca = GUID_TO_HOST_IB_HCA[guid2]
            LEAF_SWITCH_TO_HOST_IB[guid1].append((host, hca))
        # Only count switch-to-switch links; skip anything touching a non-switch node.
        if guid1 not in GUID_TO_SWITCH_NAME:
            continue
        if guid2 not in GUID_TO_SWITCH_NAME:
            continue
        SWITCH_TO_SWITCH_LINKS[guid1, guid2] += 1
# Build the switch-to-switch topology graph; parallel links collapse to one edge
# (their multiplicity is kept separately in SWITCH_TO_SWITCH_LINKS).
GRAPH = networkx.Graph()
for switch1, switch2 in SWITCH_TO_SWITCH_LINKS:
    GRAPH.add_edge(switch1, switch2)
# None is a virtual node standing for all hosts hanging off a leaf switch;
# its "links" to a leaf switch count one per attached HCA.
GUID_TO_SWITCH_NAME[None] = "SWITCH-RAIL-HOSTS"
for guid in list(LEAF_SWITCH_TO_HOST_IB.keys()):
    SWITCH_TO_SWITCH_LINKS[None, guid] = len(LEAF_SWITCH_TO_HOST_IB[guid])
    SWITCH_TO_SWITCH_LINKS[guid, None] = len(LEAF_SWITCH_TO_HOST_IB[guid])
# Dense indexes: 0 is the virtual host node, then leaf switches, then all others.
GUID_TO_SWITCH_INDEX = {None: 0}
for guid in LEAF_SWITCH_TO_HOST_IB:
    GUID_TO_SWITCH_INDEX.setdefault(guid, len(GUID_TO_SWITCH_INDEX))
for guid in GUID_TO_SWITCH_NAME:
    GUID_TO_SWITCH_INDEX.setdefault(guid, len(GUID_TO_SWITCH_INDEX))
SWITCH_INDEX_TO_GUID = {v: k for k, v in GUID_TO_SWITCH_INDEX.items()}
# Burn traffic enters and leaves the fabric only at leaf switches.
BURN_SOURCE_GUIDS = list(LEAF_SWITCH_TO_HOST_IB)
BURN_TARGET_GUIDS = list(LEAF_SWITCH_TO_HOST_IB)
# Per (source leaf, target leaf): a full link-utilization matrix over all switch indexes.
THROUGHPUT_SHAPE = (
    len(BURN_SOURCE_GUIDS) + 1,
    len(BURN_TARGET_GUIDS) + 1,
    len(GUID_TO_SWITCH_INDEX),
    len(GUID_TO_SWITCH_INDEX),
)
# Flip to True to (re)generate ib_routes.mmap; False reuses a previous run's file.
GENERATE_ROUTES = False
if GENERATE_ROUTES:
    THROUGHPUT_MMAP = numpy.memmap("ib_routes.mmap", dtype=numpy.float64, mode='w+', shape=THROUGHPUT_SHAPE)
    for source_guid, target_guid in itertools.product(BURN_SOURCE_GUIDS, BURN_TARGET_GUIDS):
        if source_guid == target_guid:
            continue
        # With adaptive routing, traffic spreads evenly over all shortest paths,
        # so each path carries 1/len(route_list) of one server pair's traffic.
        route_list = list(networkx.all_shortest_paths(GRAPH, source_guid, target_guid))
        route_cost = 1.0 / len(route_list)
        link_bw = collections.Counter()
        for path in route_list:
            guid0 = path[0]
            # The virtual None node models the host->leaf and leaf->host hops.
            link_bw[None, guid0] += route_cost
            for guid1, guid2 in zip(path[:-1], path[1:]):
                link_bw[guid1, guid2] += route_cost
            link_bw[guid2, None] += route_cost
        link_bw_shape = (
            len(GUID_TO_SWITCH_INDEX),
            len(GUID_TO_SWITCH_INDEX),
        )
        link_bw_array = numpy.zeros(link_bw_shape, dtype=numpy.float64)
        for link, bandwidth in link_bw.items():
            guid1, guid2 = link
            gidx1 = GUID_TO_SWITCH_INDEX[guid1]
            gidx2 = GUID_TO_SWITCH_INDEX[guid2]
            # Normalize by the number of parallel physical links between the two switches.
            total_bandwidth = SWITCH_TO_SWITCH_LINKS[guid1, guid2]
            used_bandwidth = bandwidth / total_bandwidth
            link_bw_array[gidx1, gidx2] = used_bandwidth
        source_idx = GUID_TO_SWITCH_INDEX[source_guid]
        target_idx = GUID_TO_SWITCH_INDEX[target_guid]
        THROUGHPUT_MMAP[source_idx, target_idx] = link_bw_array
    THROUGHPUT_MMAP.flush()
else:
    THROUGHPUT_MMAP = numpy.memmap("ib_routes.mmap", dtype=numpy.float64, mode='r', shape=THROUGHPUT_SHAPE)
# Greedy stage construction: repeatedly pick random (source leaf, target leaf)
# pairs and pack server pairs into a stage, until every switch-to-switch link
# has been exercised at >= MIN_LINK_BW in at least one selected stage.
BURN_SOURCE_INDEXES = list(range(1, len(LEAF_SWITCH_TO_HOST_IB) + 1))
BURN_TARGET_INDEXES = list(range(1, len(LEAF_SWITCH_TO_HOST_IB) + 1))
POSSIBLE_ROUTES = list(itertools.product(BURN_SOURCE_INDEXES, BURN_TARGET_INDEXES))
SELECTED_ROUTES = []
NEEDED_SHAPE = (
    len(GUID_TO_SWITCH_INDEX),
    len(GUID_TO_SWITCH_INDEX),
)
# NEEDED_LINKS[i, j] stays 1 while the i->j link still awaits a burn.
NEEDED_LINKS = numpy.zeros(NEEDED_SHAPE, dtype=numpy.uint8)
for guid1, guid2 in SWITCH_TO_SWITCH_LINKS.keys():
    # The virtual host node is not a physical link; only real switch links need burning.
    if guid1 is None:
        continue
    if guid2 is None:
        continue
    i = GUID_TO_SWITCH_INDEX[guid1]
    j = GUID_TO_SWITCH_INDEX[guid2]
    NEEDED_LINKS[i, j] = True
print(NEEDED_LINKS.sum())
# 128 is an arbitrary cap on attempts; we normally break out much earlier.
for stage in range(128):
    random.shuffle(POSSIBLE_ROUTES)
    stage_bw = numpy.zeros(NEEDED_SHAPE, dtype=THROUGHPUT_MMAP.dtype)
    stage_paths = []
    stage_burns = 0
    for source_idx, target_idx in POSSIBLE_ROUTES:
        if source_idx == target_idx:
            continue
        # Skip pairs whose routes touch no still-needed link.
        possible_new_links = (NEEDED_LINKS & (THROUGHPUT_MMAP[source_idx, target_idx] > 0)).sum()
        if possible_new_links == 0:
            continue
        source_guid = SWITCH_INDEX_TO_GUID[source_idx]
        target_guid = SWITCH_INDEX_TO_GUID[target_idx]
        # Bug fix: was max(...). Each server pair consumes one distinct HCA on
        # each side (random.sample below draws without replacement), so the
        # smaller side bounds how many pairs this route can carry.
        max_servers = min(
            len(LEAF_SWITCH_TO_HOST_IB[source_guid]),
            len(LEAF_SWITCH_TO_HOST_IB[target_guid]),
        )
        route_bw = THROUGHPUT_MMAP[source_idx, target_idx]
        # Add as many server pairs as fit without exceeding MAX_LINK_BW on any link.
        num_servers = 0
        for k in range(1, max_servers + 1):
            planned_link_bw = stage_bw + route_bw * k
            if (planned_link_bw.max() < MAX_LINK_BW):
                num_servers = k
            else:
                break
        if (num_servers > 0):
            stage_paths.append((source_guid, target_guid, num_servers))
            stage_bw = stage_bw + route_bw * num_servers
            stage_burns += num_servers
        if (stage_burns > STAGE_SIZE):
            break
    # A link counts as burned once this stage pushes it to at least MIN_LINK_BW.
    burned_links = (stage_bw >= MIN_LINK_BW)
    burned_first = (burned_links & NEEDED_LINKS)
    NEEDED_LINKS[burned_links] = 0
    print(NEEDED_LINKS.sum(), burned_first.sum())
    # Keep the stage only if it burned at least one previously-unburned link.
    if burned_first.any():
        SELECTED_ROUTES.append(stage_paths)
    if (NEEDED_LINKS.sum() == 0):
        break
# Emit ib_burn.sh: for each selected stage, launch the ib_write_bw servers over
# ssh, wait for them to come up, launch the matching writers, then wait for the
# whole stage to finish before moving on.
SCRIPT = [
    "#!/bin/bash",
    "set -e",
    "mkdir -p ./ib_burn_logs",
]
for stage, stage_paths in enumerate(SELECTED_ROUTES):
    chunk_count = len(stage_paths)
    stage_chunk = []
    for source_guid, target_guid, num_servers in stage_paths:
        source_host_hcas = LEAF_SWITCH_TO_HOST_IB[source_guid]
        target_host_hcas = LEAF_SWITCH_TO_HOST_IB[target_guid]
        # Draw num_servers distinct HCAs on each side and pair them up at random.
        paired_host_hcas = list(zip(
            random.sample(source_host_hcas, num_servers),
            random.sample(target_host_hcas, num_servers),
        ))
        stage_chunk.append(paired_host_hcas)
    for chunk, paired_host_hcas in enumerate(stage_chunk):
        # Bug fix: quote the echo argument — it contains ';', which the shell
        # would otherwise treat as a command separator and run "chunk ..." as a command.
        SCRIPT.append(f"echo 'stage {stage}; chunk {chunk} / {chunk_count}; servers'")
        for (source_host, source_hca), (target_host, target_hca) in paired_host_hcas:
            # One well-known port per HCA so concurrent servers on a host don't collide.
            server_port = list(CONFIG["ib_hcas"]).index(target_hca) + 40_000
            target_addr = CONFIG["node_info"]["nodes"].get(target_host, target_host)
            target_user = CONFIG["node_info"]["user"]
            SCRIPT.append(" ".join((
                "ssh",
                f"{target_user}@{target_addr}",
                # Bug fix: these three strings were missing the f prefix and
                # emitted literal {placeholders} into the generated script.
                f"timeout {STAGE_INNER_TIMEOUT}",
                "stdbuf -oL -eL",
                "ib_write_bw",
                "--CPU-freq",
                "--report_gbits",
                f"--ib-dev={target_hca}",
                f"--duration {PARALLEL_STAGE_TIME}",
                f"--port {server_port}",
                f"&> ./ib_burn_logs/{source_host}-{source_hca}-{target_host}-{target_hca}-server.log",
                "&",
            )))
        SCRIPT.append(f"sleep {SERVER_STARTUP_TIME}")
        SCRIPT.append(f"echo 'stage {stage}; chunk {chunk} / {chunk_count}; writers'")
        for (source_host, source_hca), (target_host, target_hca) in paired_host_hcas:
            server_port = list(CONFIG["ib_hcas"]).index(target_hca) + 40_000
            target_addr = CONFIG["node_info"]["nodes"].get(target_host, target_host)
            source_addr = CONFIG["node_info"]["nodes"].get(source_host, source_host)
            source_user = CONFIG["node_info"]["user"]
            SCRIPT.append(" ".join((
                "ssh",
                # Bug fix: ssh to the resolved address (source_addr was computed
                # but unused); the raw hostname may not resolve.
                f"{source_user}@{source_addr}",
                f"timeout {STAGE_INNER_TIMEOUT}",
                "stdbuf -oL -eL",
                "ib_write_bw",
                "--CPU-freq",
                "--report_gbits",
                f"--ib-dev={source_hca}",
                f"--duration {PARALLEL_STAGE_TIME}",
                f"--port {server_port}",
                # Bug fix: connect to the resolved server address (target_addr
                # was computed but unused in this loop).
                f"{target_addr}",
                f"&> ./ib_burn_logs/{source_host}-{source_hca}-{target_host}-{target_hca}-writer.log",
                "&",
            )))
        SCRIPT.append(f"sleep {WRITER_STARTUP_TIME}")
    SCRIPT.append(f"echo 'stage {stage}; awaiting'")
    SCRIPT.append("wait")
    SCRIPT.append(f"echo 'stage {stage}; finished'")
with open("ib_burn.sh", "w") as script_file:
    script_file.write("\n".join(SCRIPT) + "\n")

35
ib_burn/ib_fabric.sh Executable file
View File

@@ -0,0 +1,35 @@
#!/bin/bash
# Collects the IB fabric topology files consumed by ib_burn.py:
#   ib_server.list: "<node guid> <host> <hca>" per host HCA
#   ib_switch.list: "<switch guid> <name>" per switch
#   ib_switch.link: "<guid1> <guid2> <port1> <port2>" per switch-to-switch link
set -e
set -u
SOURCE="${BASH_SOURCE[0]}"
CONFIG="./config.json"
IB_SERVER_LIST="./ib_server.list"
IB_SWITCH_LIST="./ib_switch.list"
IB_SWITCH_LINK="./ib_switch.link"
cd "$(dirname "$SOURCE")"
# Bug fix: was `echo >`, which wrote a blank first line that the Python
# parser's line.split() unpacking chokes on; `: >` just truncates the file.
: > "$IB_SERVER_LIST"
# read_config <jq-filter> [<value for $ARG>]: query config.json.
function read_config {
    jq --raw-output --arg ARG "${2:-}" "$1" "$CONFIG"
}
SSH_USER=$(read_config '.node_info.user')
# Bug fix: the config key is node_info, not node_indo (SSH_PORT was always empty).
SSH_PORT=$(read_config '.node_info.port')
for SSH_HOST in $(read_config '.node_info.nodes | keys[]')
do
    SSH_ADDR=$(read_config '.node_info.nodes[$ARG]' "$SSH_HOST")
    for IB_HCA in $(read_config '.ib_hcas | keys[]')
    do
        # The node GUID is the 4th tab-separated field of ibstat's "Node" line.
        IB_GUID=$(ssh -p "$SSH_PORT" "$SSH_USER@$SSH_ADDR" "ibstat --short $IB_HCA | grep Node | tr ' ' '\t' | cut -f 4")
        echo "$IB_GUID $SSH_HOST $IB_HCA" >> "$IB_SERVER_LIST"
    done
done
# Any host can query the whole fabric; reuse the last SSH_ADDR from the loop above.
ssh -p "$SSH_PORT" "$SSH_USER@$SSH_ADDR" 'ibswitches' | tr -d '"' | awk '{ print($3, $6) }' > "$IB_SWITCH_LIST"
ssh -p "$SSH_PORT" "$SSH_USER@$SSH_ADDR" 'iblinkinfo --line --switches-only' | grep 'Active/' \
    | sed -E 's/"[^"]*"//g;s/\([^)]+\)//g;s/\[[^]]*\]//g' | awk '{ print($1, $5, $3, $7) }' > "$IB_SWITCH_LINK"

View File

@@ -1,4 +1,6 @@
loguru==0.7.0
networkx==3.1
numpy==1.24.4
tblib==2.0.0
torch==2.3.0
typing_extensions==4.9.0