Update comments / README / requirements list

2024-06-28 12:52:40 +03:00 · 2024-06-25 10:03:03 -05:00
parent 2129478bbd
commit c314f3330c
5 changed files with 27 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 This repository contains various scripts developed at Imbue for managing a large cluster of H100s, detecting and fixing hardware issues, and generally ensuring smooth model training. You can read more about our process [here](https://imbue.com/research/70b-infrastructure/)

 The code is organized as follows:
- `gpu_stress_test` tests that the GPUs on each machine are able to allocate large tensors and perform standard operations/
- `health_checks` contains various checks we use to determine which hosts are healthy.
+- `gpu_stress_test` contains a check that the GPUs on each machine are able to allocate large tensors and perform standard operations.
+- `health_checks` contains various checks we use to determine which hosts are healthy, as well as automated solutions to common issues.
 - `host_validation` contains tests to check that the GPUs on a given machine are able to communicate with each other (via NVLink) and with GPUs on other machines (via InfiniBand).
- `ufm_events` contains a script which parses the UFM event log, checks for relevant events, and determines which network ports should be disabled.
+- `ufm_events` contains a script which parses the UFM event log and other logs, checks for relevant events, and determines which network ports should be disabled.
--- a/gpu_stress_test/gpu_stress_test.py
+++ b/gpu_stress_test/gpu_stress_test.py
@@ -1,3 +1,13 @@
+"""
+Usage:
+```
+python gpu_stress_test.py [max_runtime_in_seconds]
+```
+
+`max_runtime_in_seconds` is optional and defaults to 300 seconds (5 minutes).
+"""
+
+
 import math
 import socket
 import sys
--- a/health_checks/health_checks.py
+++ b/health_checks/health_checks.py
@@ -1284,7 +1284,6 @@ class GdrEnabledHealthCheckWarning(HealthCheckWarning):
    Raised when GDR is not enabled for this machine.
    """

-    # TODO: find this somewhere in slack
    suggested_remediation: str = "\n".join(
        ("GDR is not enabled for this machine. Run `sudo modprobe nvidia-peermem`",
         "Refer here: https://download.nvidia.com/XFree86/Linux-x86_64/535.183.01/README/nvidia-peermem.html",)
--- a/host_validation/communication_validation_tests.py
+++ b/host_validation/communication_validation_tests.py
@@ -1,3 +1,12 @@
+"""
+Usage:
+```
+python communication_validation_tests.py --test
+```
+
+where `test` is one of {group_ib, p2p_ib, nvlink, wait, all_single_node}.
+"""
+
 import argparse
 import json
 import os
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+loguru==0.7.0
+tblib==2.0.0
+torch==2.3.0
+typing_extensions==4.9.0
+yasoo==0.12.6