This commit is contained in:
Alex Cheema
2024-12-11 19:24:43 +00:00
parent d95f40b6c8
commit cdae702673
3 changed files with 186 additions and 145 deletions

80
.github/bootstrap.sh vendored
View File

@@ -216,41 +216,6 @@ defaults write com.apple.Metal ForceMaximumPerformance -bool true
sudo mkdir -p /tmp/mps_cache
sudo chmod 777 /tmp/mps_cache
# Create CPU affinity configuration for performance cores
sudo tee /Library/LaunchDaemons/com.github.runner.cpuaffinity.plist << EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.github.runner.cpuaffinity</string>
<key>ProgramArguments</key>
<array>
<string>/usr/bin/taskpolicy</string>
<string>-p</string>
<string>PERFORMANCE</string>
<string>-b</string>
<string>PERFORMANCE</string>
<string>-t</string>
<string>PERFORMANCE</string>
<string>--cpu-qos</string>
<string>USER_INTERACTIVE</string>
<string>--gpu-qos</string>
<string>USER_INTERACTIVE</string>
<string>--io-qos</string>
<string>USER_INTERACTIVE</string>
<string>--affinity-tag</string>
<string>com.github.runner</string>
<string>${RUNNER_DIR}/run.sh</string>
</array>
<key>RunAtLoad</key>
<true/>
<key>KeepAlive</key>
<true/>
</dict>
</plist>
EOF
# Create and load launch daemon
log "Creating LaunchDaemon service..."
sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
@@ -266,6 +231,21 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
<string>${RUNNER_DIR}</string>
<key>ProgramArguments</key>
<array>
<string>/usr/bin/taskpolicy</string>
<string>-b</string>
<string>PERFORMANCE</string>
<string>-p</string>
<string>PERFORMANCE</string>
<string>-t</string>
<string>PERFORMANCE</string>
<string>--cpu-qos</string>
<string>USER_INTERACTIVE</string>
<string>--gpu-qos</string>
<string>USER_INTERACTIVE</string>
<string>--io-qos</string>
<string>USER_INTERACTIVE</string>
<string>--affinity-tag</string>
<string>com.github.runner</string>
<string>/usr/bin/nice</string>
<string>-n</string>
<string>-20</string>
@@ -299,6 +279,8 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
<key>MLX_METAL_PREWARM</key>
<string>1</string>
<!-- Metal Settings -->
<key>MTL_DEBUG_LAYER</key>
<string>0</string>
<key>METAL_DEBUG_ERROR_MODE</key>
<string>0</string>
<key>METAL_DEVICE_WRAPPER_TYPE</key>
@@ -335,6 +317,13 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
<string>1</string>
<key>PERFORMANCE_MODE</key>
<string>1</string>
<!-- Python Settings -->
<key>PYTHONOPTIMIZE</key>
<string>2</string>
<key>PYTHONUNBUFFERED</key>
<string>1</string>
<key>PYTHONHASHSEED</key>
<string>0</string>
</dict>
<key>RunAtLoad</key>
<true/>
@@ -346,10 +335,31 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
<false/>
<key>AbandonProcessGroup</key>
<false/>
<key>Nice</key>
<integer>-20</integer>
<key>ThrottleInterval</key>
<integer>0</integer>
<key>EnableTransactions</key>
<true/>
<key>EnablePressuredExit</key>
<false/>
<key>HardResourceLimits</key>
<dict>
<key>NumberOfFiles</key>
<integer>524288</integer>
</dict>
<key>SoftResourceLimits</key>
<dict>
<key>NumberOfFiles</key>
<integer>524288</integer>
</dict>
</dict>
</plist>
EOF
# Remove the separate CPU affinity configuration since it's now integrated
sudo rm -f /Library/LaunchDaemons/com.github.runner.cpuaffinity.plist
# Set proper permissions for the LaunchDaemon
sudo chown root:wheel /Library/LaunchDaemons/com.github.runner.plist
sudo chmod 644 /Library/LaunchDaemons/com.github.runner.plist

97
.github/optimize_performance.sh vendored Executable file
View File

@@ -0,0 +1,97 @@
#!/bin/bash
# Applies host-level performance settings: pmset power options, sysctl
# kernel values, CoreML/Metal `defaults` keys, an MPS cache directory and
# process limits, then writes environment exports to /tmp/performance_env.sh.
# Uses sudo for the pmset, sysctl, mkdir/chmod and launchctl calls.
#
# Abort as soon as any command exits with a non-zero status.
set -e
# Print a message to stdout prefixed with the current local time,
# formatted as "[YYYY-MM-DD HH:MM:SS] <message>".
# Arguments: $1 - the message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}
log "Applying comprehensive performance optimizations..."

# Power management: every "<setting> <value>" pair below is applied with
# `pmset -a`, one invocation per pair, in the order listed.
log "Configuring power management..."
power_settings=(
  "lessbright 0"
  "disablesleep 1"
  "sleep 0"
  "hibernatemode 0"
  "autopoweroff 0"
  "standby 0"
  "powernap 0"
  "proximitywake 0"
  "tcpkeepalive 1"
  "powermode 1"
  "gpuswitch 2"
  "displaysleep 0"
  "disksleep 0"
)
for power_setting in "${power_settings[@]}"; do
  # Split the pair at its single space into the setting name and its value.
  sudo pmset -a "${power_setting% *}" "${power_setting#* }"
done
# Kernel tuning: write each name=value assignment with `sysctl -w`,
# one call per assignment, in the order listed.
log "Configuring memory and kernel settings..."
for kernel_assignment in \
  kern.memorystatus_purge_on_warning=0 \
  kern.memorystatus_purge_on_critical=0 \
  kern.timer.coalescing_enabled=0 \
  kern.iogpu.dynamic_memory_management=0 \
  kern.iogpu.dynamic_memory_management_debug=0
do
  sudo sysctl -w "$kernel_assignment"
done
# User defaults for the com.apple.CoreML and com.apple.Metal domains.
# Keys that share a domain, type and value are written from a loop; the
# overall write order is the same as a flat list of commands would give.
log "Configuring Metal and GPU settings..."

# com.apple.CoreML: boolean keys set to false.
for coreml_key in MPSEnableGPUValidation MPSEnableMetalValidation MPSEnableGPUDebug; do
  defaults write com.apple.CoreML "$coreml_key" -bool false
done

# com.apple.Metal: boolean keys set to false.
for metal_key in GPUDebug GPUValidation MetalValidation MetalCaptureEnabled; do
  defaults write com.apple.Metal "$metal_key" -bool false
done

# com.apple.Metal: keys whose type or value differs from their neighbours.
defaults write com.apple.Metal MTLValidationBehavior -string "Disabled"
defaults write com.apple.Metal EnableMTLDebugLayer -bool false
defaults write com.apple.Metal MTLDebugLevel -int 0
defaults write com.apple.Metal PreferIntegratedGPU -bool false

# com.apple.Metal: boolean keys set to true.
for metal_key in ForceMaximumPerformance MTLPreferredDeviceGPUFrame; do
  defaults write com.apple.Metal "$metal_key" -bool true
done
# Ensure the MPS cache directory exists (as root) and give it mode 777 so
# any user can read, write and traverse it.
mps_cache_dir=/tmp/mps_cache
sudo mkdir -p "$mps_cache_dir"
sudo chmod 777 "$mps_cache_dir"
# Resource limits. `launchctl limit` is invoked through sudo with the same
# value for the soft and hard maxfiles limit; the `ulimit` builtins change
# the limits of this shell process and of anything it starts afterwards.
log "Configuring process limits..."
max_open_files=524288
sudo launchctl limit maxfiles "$max_open_files" "$max_open_files"
ulimit -n "$max_open_files"   # open file descriptors
ulimit -c 0                   # core file size: none
ulimit -l unlimited           # locked memory
# Write the performance-related `export` statements to
# /tmp/performance_env.sh (overwriting any existing file). Nothing is
# exported into this script's own environment; the generated file is meant
# to be sourced by whoever needs the variables. The quoted 'EOF' delimiter
# keeps the heredoc body literal, so no expansion happens at write time.
cat << 'EOF' > /tmp/performance_env.sh
# Metal optimizations
export MTL_DEBUG_LAYER=0
export METAL_DEVICE_WRAPPER_TYPE=1
export METAL_DEBUG_ERROR_MODE=0
export METAL_FORCE_PERFORMANCE_MODE=1
export METAL_DEVICE_PRIORITY=high
export METAL_MAX_COMMAND_QUEUES=1024
export METAL_LOAD_LIMIT=0
export METAL_VALIDATION_ENABLED=0
export METAL_ENABLE_VALIDATION_LAYER=0
export OBJC_DEBUG_MISSING_POOLS=NO
export MPS_CACHEDIR=/tmp/mps_cache
# MLX optimizations
export MLX_USE_GPU=1
export MLX_METAL_COMPILE_ASYNC=1
export MLX_METAL_PREALLOCATE=1
export MLX_METAL_MEMORY_GUARD=0
export MLX_METAL_CACHE_KERNELS=1
export MLX_PLACEMENT_POLICY=metal
export MLX_METAL_VALIDATION=0
export MLX_METAL_DEBUG=0
export MLX_FORCE_P_CORES=1
export MLX_METAL_MEMORY_BUDGET=0
export MLX_METAL_PREWARM=1
# Python optimizations
export PYTHONUNBUFFERED=1
export PYTHONOPTIMIZE=2
export PYTHONHASHSEED=0
export PYTHONDONTWRITEBYTECODE=1
EOF
# Final status line; names the file that was just written.
log "Performance optimizations completed. Environment variables written to /tmp/performance_env.sh"

View File

@@ -10,7 +10,7 @@ on:
model:
required: true
type: string
calling_job_name: # New input parameter
calling_job_name:
required: true
type: string
jobs:
@@ -34,20 +34,26 @@ jobs:
env:
HARDWARE_CONFIG: ${{ inputs.config }}
model: ${{ inputs.model }}
# Add performance-related environment variables
MTL_DEBUG_LAYER: 0
METAL_VALIDATION_ENABLED: 0
MLX_METAL_VALIDATION: 0
MLX_METAL_DEBUG: 0
MLX_FORCE_P_CORES: 1
MLX_METAL_PREWARM: 1
PYTHONOPTIMIZE: 2
steps:
- name: Cleanup workspace
run: |
sudo rm -rf "$GITHUB_WORKSPACE"
sudo mkdir -p "$GITHUB_WORKSPACE"
sudo chown -R $(whoami):$(id -g) "$GITHUB_WORKSPACE"
- uses: actions/checkout@v4
- name: Install dependencies
run: |
# First, find where python3.12 is installed
which python3.12 || echo "python3.12 not in PATH"
# Add common Python installation locations to PATH
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
# Now try to create the venv with explicit python3.12
python3.12 -m venv .venv || {
echo "Failed to find python3.12. Checking installation locations:"
ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
@@ -57,143 +63,70 @@ jobs:
pip install --upgrade pip
pip install -e .
pip install boto3==1.35.76
- name: Configure system
- name: Apply Performance Optimizations
run: |
# Disable all power management and performance throttling
sudo pmset -a lessbright 0
sudo pmset -a disablesleep 1
sudo pmset -a sleep 0
sudo pmset -a hibernatemode 0
sudo pmset -a autopoweroff 0
sudo pmset -a standby 0
sudo pmset -a powernap 0
sudo pmset -a proximitywake 0
sudo pmset -a tcpkeepalive 1
sudo pmset -a powermode 1
sudo pmset -a gpuswitch 2
# Make the script executable and run it
chmod +x .github/optimize_performance.sh
./.github/optimize_performance.sh
# Optimize GPU memory allocation
sudo sysctl -w kern.memorystatus_purge_on_warning=0
sudo sysctl -w kern.memorystatus_purge_on_critical=0
# Source the performance environment variables
source /tmp/performance_env.sh
# Additional performance optimizations
sudo sysctl -w kern.timer.coalescing_enabled=0
# Additional runtime optimizations
sudo sysctl -w kern.iogpu.dynamic_memory_management=0
# Optimize Metal performance
defaults write com.apple.CoreML MPSEnableGPUValidation -bool false
defaults write com.apple.CoreML MPSEnableMetalValidation -bool false
defaults write com.apple.CoreML MPSEnableGPUDebug -bool false
defaults write com.apple.Metal GPUDebug -bool false
defaults write com.apple.Metal GPUValidation -bool false
defaults write com.apple.Metal MetalValidation -bool false
defaults write com.apple.Metal MetalCaptureEnabled -bool false
defaults write com.apple.Metal MTLValidationBehavior -string "Disabled"
defaults write com.apple.Metal EnableMTLDebugLayer -bool false
defaults write com.apple.Metal MTLDebugLevel -int 0
defaults write com.apple.Metal PreferIntegratedGPU -bool false
defaults write com.apple.Metal ForceMaximumPerformance -bool true
./configure_mlx.sh
# Check final state
echo "Final system state:"
# Set process scheduling
sudo taskpolicy -b PERFORMANCE
# Verify optimizations
echo "Verifying performance settings..."
pmset -g
sysctl iogpu
sysctl kern.memorystatus_purge_on_warning
sysctl kern.memorystatus_purge_on_critical
- name: Configure process limits
run: |
# Increase resource limits
sudo launchctl limit maxfiles 524288 524288
ulimit -n 524288
# Disable core dumps
ulimit -c 0
# Set max locked memory to unlimited
ulimit -l unlimited
# Set process priority using macOS commands
CURRENT_PID=$PPID
sudo renice -n -20 $CURRENT_PID || true
# Set high performance I/O policy
sudo taskpolicy -d 0 -p $CURRENT_PID || true
# Set Metal environment variables
export METAL_DEVICE_WRAPPER_TYPE=1
export METAL_DEBUG_ERROR_MODE=0
export METAL_FORCE_PERFORMANCE_MODE=1
export METAL_DEVICE_PRIORITY=high
export METAL_MAX_COMMAND_QUEUES=1024
export METAL_LOAD_LIMIT=0
export METAL_VALIDATION_ENABLED=0
export METAL_ENABLE_VALIDATION_LAYER=0
export OBJC_DEBUG_MISSING_POOLS=NO
# MLX optimizations
export MLX_USE_GPU=1
export MLX_METAL_COMPILE_ASYNC=1
export MLX_METAL_PREALLOCATE=1
export MLX_METAL_MEMORY_GUARD=0
export MLX_METAL_CACHE_KERNELS=1
export MLX_PLACEMENT_POLICY=metal
export MLX_METAL_VALIDATION=0
export MLX_METAL_DEBUG=0
export MLX_FORCE_P_CORES=1
export MLX_METAL_MEMORY_BUDGET=0
export MLX_METAL_PREWARM=1
env | grep -E "MLX_|METAL_|MTL_"
- name: Run exo
env:
aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
run: |
# Source performance environment variables
source /tmp/performance_env.sh
# Debug information
echo "Current commit SHA: $GITHUB_SHA"
git rev-parse HEAD
git status
# List existing exo processes
echo "Existing exo processes:"
ps aux | grep exo || true
CALLING_JOB="${{ inputs.calling_job_name }}"
UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
ALL_NODE_IDS=$(for i in $(seq ${{ strategy.job-total }} -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
MY_NODE_ID="${UNIQUE_JOB_ID}_${{ strategy.job-index }}"
echo "Starting exo process with:"
echo "MY_NODE_ID: ${MY_NODE_ID}"
echo "ALL_NODE_IDS: ${ALL_NODE_IDS}"
echo "Total expected nodes: ${{ strategy.job-total }}"
source .venv/bin/activate
export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
# Check installed exo version
pip show exo
which .venv/bin/exo
echo "Starting exo daemon..."
# Set process scheduling priority
# Set high priority and performance mode
sudo renice -n -20 $$ || true
sudo taskpolicy -d 0 $$ || true
sudo taskpolicy -b PERFORMANCE $$ || true
# Start exo with inherited priority and performance settings
sudo taskpolicy -d 0 .venv/bin/exo \
# Start exo with performance optimizations
sudo taskpolicy -b PERFORMANCE .venv/bin/exo \
--node-id="${MY_NODE_ID}" \
--node-id-filter="${ALL_NODE_IDS}" \
--interface-type-filter="Ethernet" \
--chatgpt-api-port 52415 > output1.log 2>&1 &
PID1=$!
# Set process priority using macOS-specific commands
# Set process and thread priorities
sudo renice -n -20 -p $PID1 || true
sudo taskpolicy -d 0 -p $PID1 || true
sudo taskpolicy -b PERFORMANCE -p $PID1 || true
# Set thread priority for all Python threads
for tid in $(ps -M $PID1 | grep Python | awk '{print $2}'); do
sudo renice -n -20 -p $tid || true
sudo taskpolicy -d 0 -p $tid || true
sudo taskpolicy -b PERFORMANCE -p $tid || true
done
echo "Exo process started with PID: $PID1"
@@ -240,13 +173,14 @@ jobs:
done
fi
# Add system state check
- name: Check System State
- name: Check Final System State
if: always()
run: |
echo "=== Final System State ==="
sudo pmset -g
sudo powermetrics -n 1 -i 1000 --show-process-energy
sudo powermetrics -n 1 -i 1000 --show-process-energy || true
system_profiler SPDisplaysDataType
sysctl iogpu
ps -eo pid,ppid,user,%cpu,%mem,nice,state,command | grep -i python
sudo launchctl list | grep github
env | grep -E "MLX|METAL"
env | grep -E "MLX_|METAL_|MTL_"
echo "=== End Final System State ==="