rip out stats bloat

This commit is contained in:
Alex Cheema
2024-12-14 21:40:14 +00:00
parent cb4615c95d
commit 06c2e236b8
5 changed files with 0 additions and 68 deletions

View File

@@ -52,7 +52,6 @@ parser.add_argument("--models-seed-dir", type=str, default=None, help="Model see
# Networking / discovery flags.
parser.add_argument("--listen-port", type=int, default=5678, help="Listening port for discovery")
# Model-shard download behaviour.
parser.add_argument("--download-quick-check", action="store_true", help="Quick check local path for model shards download")
parser.add_argument("--max-parallel-downloads", type=int, default=4, help="Max parallel downloads for model shards download")
# Metrics are opt-in: None (the default) disables the Prometheus server entirely.
parser.add_argument("--prometheus-client-port", type=int, default=None, help="Prometheus client port")
# NOTE(review): --broadcast-port shares default 5678 with --listen-port — confirm intended.
parser.add_argument("--broadcast-port", type=int, default=5678, help="Broadcast port for discovery")
parser.add_argument("--discovery-module", type=str, choices=["udp", "tailscale", "manual"], default="udp", help="Discovery module to use")
parser.add_argument("--discovery-timeout", type=int, default=30, help="Discovery timeout in seconds")
@@ -170,10 +169,6 @@ def preemptively_start_download(request_id: str, opaque_status: str):
# When any peer reports a "start_download" status, begin fetching shards preemptively.
node.on_opaque_status.register("start_download").on_next(preemptively_start_download)

# Metrics are opt-in: only start the Prometheus exporter when a port was given.
if args.prometheus_client_port:
  # Imported lazily so prometheus_client is only required when metrics are enabled.
  from exo.stats.metrics import start_metrics_server
  start_metrics_server(node, args.prometheus_client_port)

last_broadcast_time = 0

View File

View File

@@ -1,27 +0,0 @@
# Local monitoring stack: Prometheus scrapes the exo node's metrics endpoint,
# Grafana visualises them (point Grafana's datasource at http://prometheus:9090).
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    # Mount the scrape config into the image's default config location.
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - "9090:9090"
    networks:
      - monitoring
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    networks:
      - monitoring
    # Ensure the datasource container is up before Grafana starts.
    depends_on:
      - prometheus
networks:
  monitoring:

View File

@@ -1,29 +0,0 @@
from exo.orchestration import Node
from prometheus_client import start_http_server, Counter, Histogram
import json

# Prometheus metrics, all labelled with the id of the node that reported the
# underlying status event, so a single exporter can aggregate a whole cluster.
# Create metrics to track time spent and requests made.
PROCESS_PROMPT_COUNTER = Counter("process_prompt_total", "Total number of prompts processed", ["node_id"])
PROCESS_TENSOR_COUNTER = Counter("process_tensor_total", "Total number of tensors processed", ["node_id"])
PROCESS_TENSOR_TIME = Histogram("process_tensor_seconds", "Time spent processing tensor", ["node_id"])
def start_metrics_server(node: Node, port: int):
  """Expose Prometheus metrics for *node* over HTTP on *port*.

  Starts prometheus_client's background HTTP server, then subscribes to the
  node's opaque-status event stream, incrementing per-node counters for
  processed prompts/tensors and recording tensor processing time.
  """
  start_http_server(port)

  def _on_opaque_status(request_id, opaque_status: str):
    # Opaque statuses are free-form strings from peers; a non-JSON payload
    # must not blow up the event callback, so ignore it explicitly.
    try:
      status_data = json.loads(opaque_status)
    except json.JSONDecodeError:
      return
    _type = status_data.get("type", "")
    if _type != "node_status":
      return
    node_id = status_data.get("node_id", "")
    status = status_data.get("status", "")
    if status == "end_process_prompt":
      PROCESS_PROMPT_COUNTER.labels(node_id=node_id).inc()
    elif status == "end_process_tensor":
      # elapsed_time_ns defaults to 0 when the peer omitted the timing field.
      elapsed_time_ns = status_data.get("elapsed_time_ns", 0)
      PROCESS_TENSOR_COUNTER.labels(node_id=node_id).inc()
      PROCESS_TENSOR_TIME.labels(node_id=node_id).observe(elapsed_time_ns/1e9)  # Convert ns to seconds

  node.on_opaque_status.register("stats").on_next(_on_opaque_status)

View File

@@ -1,7 +0,0 @@
# Prometheus scrape configuration for the exo node metrics exporter.
global:
  scrape_interval: 15s  # poll every target at this interval
scrape_configs:
  - job_name: 'exo-node'
    static_configs:
      # NOTE(review): assumes the exo node runs on the Docker host and serves
      # metrics on port 8005 (the value passed as --prometheus-client-port) —
      # confirm against the node's launch flags.
      - targets: ['host.docker.internal:8005']