mirror of
https://github.com/imbue-ai/cluster-health.git
synced 2024-06-28 12:52:40 +03:00
Update comments / README / requirements list
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
This repository contains various scripts developed at Imbue for managing a large cluster of H100s, detecting and fixing hardware issues, and generally ensuring smooth model training. You can read more about our process [here](https://imbue.com/research/70b-infrastructure/).
|
||||
|
||||
The code is organized as follows:
|
||||
- `gpu_stress_test` tests that the GPUs on each machine are able to allocate large tensors and perform standard operations/
|
||||
- `health_checks` contains various checks we use to determine which hosts are healthy.
|
||||
- `gpu_stress_test` contains a check that the GPUs on each machine are able to allocate large tensors and perform standard operations.
|
||||
- `health_checks` contains various checks we use to determine which hosts are healthy, as well as automated solutions to common issues.
|
||||
- `host_validation` contains tests to check that the GPUs on a given machine are able to communicate with each other (via NVLink) and with GPUs on other machines (via InfiniBand).
|
||||
- `ufm_events` contains a script which parses the UFM event log, checks for relevant events, and determines which network ports should be disabled.
|
||||
- `ufm_events` contains a script which parses the UFM event log and other logs, checks for relevant events, and determines which network ports should be disabled.
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
"""
|
||||
Usage:
|
||||
```
|
||||
python gpu_stress_test.py max_runtime_in_seconds
|
||||
```
|
||||
|
||||
`max_runtime_in_seconds` is optional and defaults to 300 seconds (5 minutes).
|
||||
"""
|
||||
|
||||
|
||||
import math
|
||||
import socket
|
||||
import sys
|
||||
|
||||
@@ -1284,7 +1284,6 @@ class GdrEnabledHealthCheckWarning(HealthCheckWarning):
|
||||
Raised when GDR is not enabled for this machine.
|
||||
"""
|
||||
|
||||
# TODO: find this somewhere in slack
|
||||
suggested_remediation: str = "\n".join(
|
||||
("GDR is not enabled for this machine. Run `sudo modprobe nvidia-peermem`",
|
||||
"Refer here: https://download.nvidia.com/XFree86/Linux-x86_64/535.183.01/README/nvidia-peermem.html",)
|
||||
|
||||
@@ -1,3 +1,12 @@
|
||||
"""
|
||||
Usage:
|
||||
```
|
||||
python communication_validation_tests.py --test
|
||||
```
|
||||
|
||||
where test is one of {group_ib, p2p_ib, nvlink, wait, all_single_node}
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
loguru==0.7.0
|
||||
tblib==2.0.0
|
||||
torch==2.3.0
|
||||
typing_extensions==4.9.0
|
||||
yasoo==0.12.6
|
||||
Reference in New Issue
Block a user