mirror of
				https://github.com/imbue-ai/cluster-health.git
				synced 2024-06-28 12:52:40 +03:00 
			
		
		
		
	Include the ib_burn scripts.
This commit is contained in:
		| @@ -7,3 +7,4 @@ The code is organized as follows: | ||||
| - `health_checks` contains various checks we use to determine which hosts are healthy, as well as automated solutions to common issues. | ||||
| - `host_validation` contains tests to check that the GPUs on a given machine are able to communicate with each other (via NVLink) and with GPUs on other machines (via InfiniBand). | ||||
| - `ufm_events` contains a script which parses the UFM event log and other logs, checks for relevant events, and determines which network ports should be disabled. | ||||
| - `ib_burn` contains a script for generating a comprehensive burn-in workload for IB fabrics, aiming to exercise every available link. | ||||
|   | ||||
							
								
								
									
										29
									
								
								ib_burn/config.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								ib_burn/config.json
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| { | ||||
|     "node_info": { | ||||
|         "nodes": { | ||||
|             "hostname-23r66x3": "10.1.2.3", | ||||
|             "hostname-32ls6x3": "10.1.2.4", | ||||
|             "hostname-3lvy7y3": "10.1.2.5" | ||||
|         }, | ||||
|         "user": "root", | ||||
|         "port": 22 | ||||
|     }, | ||||
|     "ib_hcas": { | ||||
|         "mlx5_0": "ibp26s0", | ||||
|         "mlx5_3": "ibp60s0", | ||||
|         "mlx5_4": "ibp77s0", | ||||
|         "mlx5_5": "ibp94s0", | ||||
|         "mlx5_6": "ibp156s0", | ||||
|         "mlx5_9": "ibp188s0", | ||||
|         "mlx5_10": "ibp204s0", | ||||
|         "mlx5_11": "ibp220s0" | ||||
|     }, | ||||
|     "ib_burn": { | ||||
|         "min_link_throughput": 0.50, | ||||
|         "max_link_throughput": 1.00, | ||||
|         "parallel_stage_size": 1024, | ||||
|         "parallel_stage_time": 1800, | ||||
|         "server_startup_time": 15, | ||||
|         "writer_startup_time": 1 | ||||
|     } | ||||
| } | ||||
							
								
								
									
										281
									
								
								ib_burn/ib_burn.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										281
									
								
								ib_burn/ib_burn.py
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,281 @@ | ||||
| #!/usr/bin/env python3 | ||||
|  | ||||
| """ | ||||
| Generates a comprehensive IB fabric burn-in workload (ib_burn.sh). | ||||
|  | ||||
| Gathers the necessary information from config.json and files generated by ./ib_fabric.sh (run it first!). | ||||
|  | ||||
| We assume that if multiple shortest paths exist between two switches, the traffic is evenly distributed among them. | ||||
|  | ||||
| That assumption holds for IB fabrics with AR (Adaptive Routing) enabled, but your mileage may vary. | ||||
|  | ||||
| This script can't be directly used with ethernet / RoCE fabrics, but the general logic should work. | ||||
|  | ||||
| You'll want to gather traffic counters before and after the burn-in to verify the results. | ||||
| """ | ||||
|  | ||||
| import collections | ||||
| import itertools | ||||
| import json | ||||
| import random | ||||
|  | ||||
| import networkx | ||||
| import numpy | ||||
|  | ||||
|  | ||||
| CONFIG = json.load(open("config.json")) | ||||
|  | ||||
| MIN_LINK_BW = CONFIG["ib_burn"]["min_link_throughput"] | ||||
| MAX_LINK_BW = CONFIG["ib_burn"]["max_link_throughput"] | ||||
| STAGE_SIZE = CONFIG["ib_burn"]["parallel_stage_size"] | ||||
|  | ||||
| PARALLEL_STAGE_TIME = CONFIG["ib_burn"]["parallel_stage_time"] | ||||
| SERVER_STARTUP_TIME = CONFIG["ib_burn"]["server_startup_time"] | ||||
| WRITER_STARTUP_TIME = CONFIG["ib_burn"]["writer_startup_time"] | ||||
| STAGE_INNER_TIMEOUT = PARALLEL_STAGE_TIME + SERVER_STARTUP_TIME + WRITER_STARTUP_TIME + 10 | ||||
|  | ||||
|  | ||||
| GUID_TO_HOST_IB_HCA = {} | ||||
| GUID_TO_SWITCH_NAME = {} | ||||
|  | ||||
| SWITCH_TO_SWITCH_LINKS = collections.Counter() | ||||
| LEAF_SWITCH_TO_HOST_IB = collections.defaultdict(list) | ||||
|  | ||||
| for line in open("ib_server.list"): | ||||
|     guid, host, hca = line.split() | ||||
|     GUID_TO_HOST_IB_HCA[guid] = (host, hca) | ||||
|  | ||||
|  | ||||
| for line in open("ib_switch.list"): | ||||
|     guid, name = line.split() | ||||
|     GUID_TO_SWITCH_NAME[guid] = name | ||||
|  | ||||
| for line in open("ib_switch.link"): | ||||
|     guid1, guid2, port1, port2 = line.split() | ||||
|     if guid1 in GUID_TO_HOST_IB_HCA: | ||||
|         host, hca = GUID_TO_HOST_IB_HCA[guid1] | ||||
|         LEAF_SWITCH_TO_HOST_IB[guid2].append((host, hca)) | ||||
|     if guid2 in GUID_TO_HOST_IB_HCA: | ||||
|         host, hca = GUID_TO_HOST_IB_HCA[guid2] | ||||
|         LEAF_SWITCH_TO_HOST_IB[guid1].append((host, hca)) | ||||
|     if guid1 not in GUID_TO_SWITCH_NAME: | ||||
|         continue | ||||
|     if guid2 not in GUID_TO_SWITCH_NAME: | ||||
|         continue | ||||
|     SWITCH_TO_SWITCH_LINKS[guid1, guid2] += 1 | ||||
|  | ||||
| GRAPH = networkx.Graph() | ||||
| for switch1, switch2 in SWITCH_TO_SWITCH_LINKS: | ||||
|     name1 = GUID_TO_SWITCH_NAME[switch1] | ||||
|     name2 = GUID_TO_SWITCH_NAME[switch2] | ||||
|     GRAPH.add_edge(switch1, switch2) | ||||
|  | ||||
| GUID_TO_SWITCH_NAME[None] = "SWITCH-RAIL-HOSTS" | ||||
| for guid in list(LEAF_SWITCH_TO_HOST_IB.keys()): | ||||
|     SWITCH_TO_SWITCH_LINKS[None, guid] = len(LEAF_SWITCH_TO_HOST_IB[guid]) | ||||
|     SWITCH_TO_SWITCH_LINKS[guid, None] = len(LEAF_SWITCH_TO_HOST_IB[guid]) | ||||
|  | ||||
|  | ||||
| GUID_TO_SWITCH_INDEX = {None: 0} | ||||
| for guid in LEAF_SWITCH_TO_HOST_IB: | ||||
|     GUID_TO_SWITCH_INDEX.setdefault(guid, len(GUID_TO_SWITCH_INDEX)) | ||||
| for guid in GUID_TO_SWITCH_NAME: | ||||
|     GUID_TO_SWITCH_INDEX.setdefault(guid, len(GUID_TO_SWITCH_INDEX)) | ||||
|  | ||||
| SWITCH_INDEX_TO_GUID = {v: k for k, v in GUID_TO_SWITCH_INDEX.items()} | ||||
|  | ||||
|  | ||||
| BURN_SOURCE_GUIDS = list(LEAF_SWITCH_TO_HOST_IB) | ||||
| BURN_TARGET_GUIDS = list(LEAF_SWITCH_TO_HOST_IB) | ||||
|  | ||||
| THROUGHPUT_SHAPE = ( | ||||
|     len(BURN_SOURCE_GUIDS) + 1, | ||||
|     len(BURN_TARGET_GUIDS) + 1, | ||||
|     len(GUID_TO_SWITCH_INDEX), | ||||
|     len(GUID_TO_SWITCH_INDEX), | ||||
| ) | ||||
|  | ||||
| # THROUGHPUT_MMAP[source_idx, target_idx] is a (switch x switch) matrix holding, for one | ||||
| # flow between two leaf switches, the load it puts on each link group, assuming the | ||||
| # flow is split evenly over all shortest paths. | ||||
| # The generation branch is disabled by `if False`, so as written the script only opens | ||||
| # an existing ib_routes.mmap read-only. | ||||
| # NOTE(review): presumably the condition is flipped by hand to (re)build the table on the | ||||
| # first run or after the fabric changes - confirm the intended workflow. | ||||
| if False: | ||||
|     THROUGHPUT_MMAP = numpy.memmap("ib_routes.mmap", dtype=numpy.float64, mode='w+', shape=THROUGHPUT_SHAPE) | ||||
|  | ||||
|     for source_guid, target_guid in itertools.product(BURN_SOURCE_GUIDS, BURN_TARGET_GUIDS): | ||||
|         if source_guid == target_guid: | ||||
|             continue | ||||
|  | ||||
|         # Every shortest path carries an equal share of the flow. | ||||
|         route_list = list(networkx.all_shortest_paths(GRAPH, source_guid, target_guid)) | ||||
|         route_cost = 1.0 / len(route_list) | ||||
|  | ||||
|         link_bw = collections.Counter() | ||||
|  | ||||
|         for path in route_list: | ||||
|             # Host-facing hops are modelled as links from/to the pseudo switch None. | ||||
|             guid0 = path[0] | ||||
|             link_bw[None, guid0] += route_cost | ||||
|             for guid1, guid2 in zip(path[:-1], path[1:]): | ||||
|                 link_bw[guid1, guid2] += route_cost | ||||
|             # guid2 is left over from the loop above, i.e. the last switch of the path. | ||||
|             link_bw[guid2, None] += route_cost | ||||
|          | ||||
|         link_bw_shape = ( | ||||
|             len(GUID_TO_SWITCH_INDEX), | ||||
|             len(GUID_TO_SWITCH_INDEX), | ||||
|         ) | ||||
|         link_bw_array = numpy.zeros(link_bw_shape, dtype=numpy.float64) | ||||
|  | ||||
|         for link, bandwidth in link_bw.items(): | ||||
|             guid1 = link[0] | ||||
|             guid2 = link[1] | ||||
|             gidx1 = GUID_TO_SWITCH_INDEX[guid1] | ||||
|             gidx2 = GUID_TO_SWITCH_INDEX[guid2] | ||||
|             # Normalize by the size of the link group: the number of links counted between | ||||
|             # the two switches, or the number of attached hosts for the pseudo switch. | ||||
|             total_badwidth = SWITCH_TO_SWITCH_LINKS[guid1,guid2] | ||||
|             used_bandwidth = bandwidth / total_badwidth | ||||
|             link_bw_array[gidx1, gidx2] = used_bandwidth | ||||
|          | ||||
|         source_idx = GUID_TO_SWITCH_INDEX[source_guid] | ||||
|         target_idx = GUID_TO_SWITCH_INDEX[target_guid] | ||||
|         THROUGHPUT_MMAP[source_idx, target_idx] = link_bw_array | ||||
|  | ||||
|     THROUGHPUT_MMAP.flush() | ||||
| else: | ||||
|     # Read-only view of a table written by an earlier run of the branch above. | ||||
|     THROUGHPUT_MMAP = numpy.memmap("ib_routes.mmap", dtype=numpy.float64, mode='r', shape=THROUGHPUT_SHAPE) | ||||
|      | ||||
|  | ||||
| BURN_SOURCE_INDEXES = list(range(1, len(LEAF_SWITCH_TO_HOST_IB) + 1)) | ||||
| BURN_TARGET_INDEXES = list(range(1, len(LEAF_SWITCH_TO_HOST_IB) + 1)) | ||||
|  | ||||
| POSSIBLE_ROUTES = list(itertools.product(BURN_SOURCE_INDEXES, BURN_TARGET_INDEXES)) | ||||
| SELECTED_ROUTES = [] | ||||
|  | ||||
|  | ||||
| NEEDED_SHAPE = ( | ||||
|     len(GUID_TO_SWITCH_INDEX), | ||||
|     len(GUID_TO_SWITCH_INDEX), | ||||
| ) | ||||
| NEEDED_LINKS = numpy.zeros(NEEDED_SHAPE, dtype=numpy.uint8) | ||||
|  | ||||
| for guid1, guid2 in SWITCH_TO_SWITCH_LINKS.keys(): | ||||
|     if guid1 is None: | ||||
|         continue | ||||
|     if guid2 is None: | ||||
|         continue | ||||
|     i = GUID_TO_SWITCH_INDEX[guid1] | ||||
|     j = GUID_TO_SWITCH_INDEX[guid2] | ||||
|     NEEDED_LINKS[i,j] = True | ||||
|  | ||||
|  | ||||
| print(NEEDED_LINKS.sum()) | ||||
|  | ||||
| # Greedy stage planning, at most 128 attempts. Each attempt shuffles the candidate | ||||
| # routes, packs routes into a stage while every planned link load stays below | ||||
| # MAX_LINK_BW, and keeps the stage only if it burns at least one still-needed link. | ||||
| for stage in range(128): | ||||
|     random.shuffle(POSSIBLE_ROUTES) | ||||
|  | ||||
|     # Accumulated load per link for the stage being built. | ||||
|     stage_bw = numpy.zeros(NEEDED_SHAPE, dtype=THROUGHPUT_MMAP.dtype) | ||||
|     stage_paths = [] | ||||
|     stage_burns = 0 | ||||
|     for source_idx, target_idx in POSSIBLE_ROUTES: | ||||
|         if source_idx == target_idx: | ||||
|             continue | ||||
|         # Skip routes that put no load on any link that is still needed. | ||||
|         possible_new_links = (NEEDED_LINKS & (THROUGHPUT_MMAP[source_idx, target_idx] > 0)).sum() | ||||
|         if possible_new_links == 0: | ||||
|             continue | ||||
|  | ||||
|         source_guid = SWITCH_INDEX_TO_GUID[source_idx] | ||||
|         target_guid = SWITCH_INDEX_TO_GUID[target_idx] | ||||
|         max_servers = max( | ||||
|             len(LEAF_SWITCH_TO_HOST_IB[source_guid]), | ||||
|             len(LEAF_SWITCH_TO_HOST_IB[target_guid]), | ||||
|         ) | ||||
|         route_bw = THROUGHPUT_MMAP[source_idx, target_idx] | ||||
|         num_servers = 0 | ||||
|  | ||||
|         # Largest k for which k flows on this route keep every link below MAX_LINK_BW. | ||||
|         for k in range(1, max_servers + 1): | ||||
|             planned_link_bw = stage_bw + route_bw * k | ||||
|             if (planned_link_bw.max() < MAX_LINK_BW): | ||||
|                 num_servers = k | ||||
|             else: | ||||
|                 break | ||||
|  | ||||
|         if (num_servers > 0): | ||||
|             stage_paths.append((source_guid, target_guid, num_servers)) | ||||
|             stage_bw = stage_bw + route_bw * num_servers | ||||
|             stage_burns += num_servers | ||||
|             # The stage is full once it holds more than STAGE_SIZE burn pairs. | ||||
|             if (stage_burns > STAGE_SIZE): | ||||
|                 break | ||||
|  | ||||
|     # A link counts as burned when the stage loads it to at least MIN_LINK_BW. | ||||
|     burned_links = (stage_bw >= MIN_LINK_BW) | ||||
|     burned_first = (burned_links & NEEDED_LINKS) | ||||
|  | ||||
|     NEEDED_LINKS[burned_links] = 0 | ||||
|     # Progress: links still needed, links newly burned by this stage. | ||||
|     print(NEEDED_LINKS.sum(), burned_first.sum()) | ||||
|  | ||||
|     # Stages that burn nothing new are dropped. | ||||
|     if burned_first.any(): | ||||
|         SELECTED_ROUTES.append(stage_paths) | ||||
|      | ||||
|     if (NEEDED_LINKS.sum() == 0): | ||||
|         break | ||||
|  | ||||
| SCRIPT = [ | ||||
|     "#!/bin/bash", | ||||
|     "set -e", | ||||
|     "mkdir -p ./ib_burn_logs", | ||||
| ] | ||||
| STAGES = [] | ||||
| for stage, stage_paths in enumerate(SELECTED_ROUTES): | ||||
|     chunk_count = len(stage_paths) | ||||
|     stage_chunk = [] | ||||
|     for source_guid, target_guid, num_servers in stage_paths: | ||||
|         source_host_hcas = LEAF_SWITCH_TO_HOST_IB[source_guid] | ||||
|         target_host_hcas = LEAF_SWITCH_TO_HOST_IB[target_guid] | ||||
|         paired_host_hcas = list(zip( | ||||
|             random.sample(source_host_hcas, num_servers), | ||||
|             random.sample(target_host_hcas, num_servers), | ||||
|         )) | ||||
|         stage_chunk.append(paired_host_hcas) | ||||
|     for chunk, paired_host_hcas in enumerate(stage_chunk): | ||||
|         SCRIPT.append(f"echo stage {stage}; chunk {chunk} / {chunk_count}; servers") | ||||
|         for (source_host, source_hca), (target_host, target_hca) in paired_host_hcas: | ||||
|             server_port = list(CONFIG["ib_hcas"]).index(target_hca) + 40_000 | ||||
|             target_addr = CONFIG["node_info"]["nodes"].get(target_host, target_host) | ||||
|             target_user = CONFIG["node_info"]["user"] | ||||
|             SCRIPT.append(" ".join(( | ||||
|                 "ssh", | ||||
|                 f"{target_user}@{target_addr}", | ||||
|                 "timeout {STAGE_INNER_TIMEOUT}", | ||||
|                 "stdbuf -oL -eL", | ||||
|                 "ib_write_bw", | ||||
|                 "--CPU-freq", | ||||
|                 "--report_gbits", | ||||
|                 f"--ib-dev={target_hca}", | ||||
|                 f"--duration {PARALLEL_STAGE_TIME}", | ||||
|                 f"--port {server_port}", | ||||
|                 "&> ./ib_burn_logs/{source_host}-{source_hca}-{target_host}-{target_hca}-server.log", | ||||
|                 "&", | ||||
|             ))) | ||||
|         SCRIPT.append(f"sleep {SERVER_STARTUP_TIME}") | ||||
|         SCRIPT.append(f"echo stage {stage}; chunk {chunk} / {chunk_count}; writers") | ||||
|         for (source_host, source_hca), (target_host, target_hca) in paired_host_hcas: | ||||
|             server_port = list(CONFIG["ib_hcas"]).index(target_hca) + 40_000 | ||||
|             target_addr = CONFIG["node_info"]["nodes"].get(target_host, target_host) | ||||
|             source_addr = CONFIG["node_info"]["nodes"].get(source_host, source_host) | ||||
|             source_user = CONFIG["node_info"]["user"] | ||||
|             SCRIPT.append(" ".join(( | ||||
|                 "ssh", | ||||
|                 f"{source_user}@{source_host}", | ||||
|                 "timeout {STAGE_INNER_TIMEOUT}", | ||||
|                 "stdbuf -oL -eL", | ||||
|                 "ib_write_bw", | ||||
|                 "--CPU-freq", | ||||
|                 "--report_gbits", | ||||
|                 f"--ib-dev={source_hca}", | ||||
|                 f"--duration {PARALLEL_STAGE_TIME}", | ||||
|                 f"--port {server_port}", | ||||
|                 f"{target_host}", | ||||
|                 "&> ./ib_burn_logs/{source_host}-{source_hca}-{target_host}-{target_hca}-writer.log", | ||||
|                 "&", | ||||
|             ))) | ||||
|         SCRIPT.append(f"sleep {WRITER_STARTUP_TIME}") | ||||
|     SCRIPT.append(f"echo stage {stage}; awaiting") | ||||
|     SCRIPT.append("wait") | ||||
|     SCRIPT.append(f"echo stage {stage}; finished") | ||||
|  | ||||
|  | ||||
| SCRIPT_FILE = open("ib_burn.sh", "w") | ||||
| SCRIPT_FILE.write("\n".join(SCRIPT) + "\n") | ||||
| SCRIPT_FILE.close() | ||||
							
								
								
									
										35
									
								
								ib_burn/ib_fabric.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										35
									
								
								ib_burn/ib_fabric.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| #!/bin/bash | ||||
| # Collects the fabric description files read by ib_burn.py: | ||||
| #   ib_server.list - "<GUID> <hostname> <HCA>" for every configured HCA on every node | ||||
| #   ib_switch.list - two columns per switch taken from `ibswitches` (read as GUID and name) | ||||
| #   ib_switch.link - four columns per active link taken from `iblinkinfo` | ||||
| #                    (read as guid1, guid2, port1, port2) | ||||
| set -e | ||||
| set -u | ||||
|  | ||||
| SOURCE="${BASH_SOURCE[0]}" | ||||
| CONFIG="./config.json" | ||||
|  | ||||
| IB_SERVER_LIST="./ib_server.list" | ||||
| IB_SWITCH_LIST="./ib_switch.list" | ||||
| IB_SWITCH_LINK="./ib_switch.link" | ||||
|  | ||||
| # Work next to the script so the relative paths above resolve; quoted for paths with spaces. | ||||
| cd "$(dirname "$SOURCE")" | ||||
|  | ||||
| # Truncate the list; `echo >` would leave a blank first record that breaks line parsing. | ||||
| : > "$IB_SERVER_LIST" | ||||
|  | ||||
| # read_config FILTER [ARG]: run a jq filter on the config; ARG is visible to the filter as $ARG. | ||||
| function read_config { | ||||
|     jq --raw-output --arg ARG "${2:-}" "$1" "$CONFIG" | ||||
| } | ||||
|  | ||||
| SSH_USER=$(read_config '.node_info.user') | ||||
| # Falls back to the default ssh port when the config does not set one. | ||||
| SSH_PORT=$(read_config '.node_info.port // 22') | ||||
|  | ||||
| for SSH_HOST in $(read_config '.node_info.nodes | keys[]') | ||||
| do | ||||
|     SSH_ADDR=$(read_config '.node_info.nodes[$ARG]' "$SSH_HOST") | ||||
|     for IB_HCA in $(read_config '.ib_hcas | keys[]') | ||||
|     do | ||||
|         # Fourth tab-separated field of the "Node" line printed by `ibstat --short`. | ||||
|         IB_GUID=$(ssh -p "$SSH_PORT" "$SSH_USER@$SSH_ADDR" "ibstat --short $IB_HCA | grep Node | tr ' ' '\t' | cut -f 4") | ||||
|         echo "$IB_GUID $SSH_HOST $IB_HCA" >> "$IB_SERVER_LIST" | ||||
|     done | ||||
| done | ||||
|  | ||||
| # The fabric-wide queries reuse the last node visited by the loop above. | ||||
| ssh -p "$SSH_PORT" "$SSH_USER@$SSH_ADDR" 'ibswitches' | tr -d '"' | awk '{ print($3, $6) }' > "$IB_SWITCH_LIST" | ||||
| # Keep active links only; quoted names, (...) and [...] groups are removed before picking columns. | ||||
| ssh -p "$SSH_PORT" "$SSH_USER@$SSH_ADDR" 'iblinkinfo --line --switches-only' | grep 'Active/' \ | ||||
|     | sed -E 's/"[^"]*"//g;s/\([^)]+\)//g;s/\[[^]]*\]//g' | awk '{ print($1, $5, $3, $7) }' > "$IB_SWITCH_LINK" | ||||
| @@ -1,4 +1,6 @@ | ||||
| loguru==0.7.0 | ||||
| networkx==3.1 | ||||
| numpy==1.24.4 | ||||
| tblib==2.0.0 | ||||
| torch==2.3.0 | ||||
| typing_extensions==4.9.0 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Bartosz Wróblewski
					Bartosz Wróblewski