Mirror of https://github.com/exo-explore/exo.git (synced 2025-10-23 02:57:14 +03:00)
rip out stats bloat
@@ -52,7 +52,6 @@ parser.add_argument("--models-seed-dir", type=str, default=None, help="Model see
 parser.add_argument("--listen-port", type=int, default=5678, help="Listening port for discovery")
 parser.add_argument("--download-quick-check", action="store_true", help="Quick check local path for model shards download")
 parser.add_argument("--max-parallel-downloads", type=int, default=4, help="Max parallel downloads for model shards download")
-parser.add_argument("--prometheus-client-port", type=int, default=None, help="Prometheus client port")
 parser.add_argument("--broadcast-port", type=int, default=5678, help="Broadcast port for discovery")
 parser.add_argument("--discovery-module", type=str, choices=["udp", "tailscale", "manual"], default="udp", help="Discovery module to use")
 parser.add_argument("--discovery-timeout", type=int, default=30, help="Discovery timeout in seconds")
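A quick, self-contained sketch of how the removed flag behaved before this commit: with default=None the metrics server was strictly opt-in. The parser line is copied from the hunk above; the parse_args() input and the final print are purely illustrative.

import argparse

# The flag definition is taken verbatim from the removed line above; the rest
# of this snippet is illustrative and not part of exo.
parser = argparse.ArgumentParser()
parser.add_argument("--prometheus-client-port", type=int, default=None, help="Prometheus client port")

args = parser.parse_args(["--prometheus-client-port", "8005"])
if args.prometheus_client_port:
  print(f"would start the metrics server on port {args.prometheus_client_port}")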
@@ -170,10 +169,6 @@ def preemptively_start_download(request_id: str, opaque_status: str):
 
 node.on_opaque_status.register("start_download").on_next(preemptively_start_download)
 
-if args.prometheus_client_port:
-  from exo.stats.metrics import start_metrics_server
-  start_metrics_server(node, args.prometheus_client_port)
-
 last_broadcast_time = 0
 
 
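The surviving context lines show the observer-style API the removed block relied on: register() a named subscriber on the node's on_opaque_status event, then attach a callback with on_next(). The toy stand-in below only mimics that call shape so the wiring is easier to follow; it is not exo's actual event system, and every name apart from register/on_next is made up.

from typing import Callable, Dict, List

# Toy stand-in for the observer API used above: register() returns a handle,
# on_next() attaches a callback, notify() fans an event out to every callback.
# Mimics the call shape only; not exo's real implementation.
class CallbackHandle:
  def __init__(self):
    self.callbacks: List[Callable] = []

  def on_next(self, fn: Callable) -> None:
    self.callbacks.append(fn)

class CallbackSystem:
  def __init__(self):
    self.handles: Dict[str, CallbackHandle] = {}

  def register(self, name: str) -> CallbackHandle:
    return self.handles.setdefault(name, CallbackHandle())

  def notify(self, *args) -> None:
    for handle in self.handles.values():
      for fn in handle.callbacks:
        fn(*args)

on_opaque_status = CallbackSystem()
on_opaque_status.register("stats").on_next(lambda request_id, status: print(request_id, status))
on_opaque_status.notify("req-1", '{"type": "node_status", "status": "end_process_prompt"}')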
@@ -1,27 +0,0 @@
-version: '3.8'
-
-services:
-  prometheus:
-    image: prom/prometheus:latest
-    container_name: prometheus
-    volumes:
-      - ./prometheus.yml:/etc/prometheus/prometheus.yml
-    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-    ports:
-      - "9090:9090"
-    networks:
-      - monitoring
-
-  grafana:
-    image: grafana/grafana:latest
-    container_name: grafana
-    ports:
-      - "3000:3000"
-    networks:
-      - monitoring
-    depends_on:
-      - prometheus
-
-networks:
-  monitoring:
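If the deleted Prometheus/Grafana stack is still running somewhere, a quick way to confirm it can reach the node is to ask Prometheus (published on port 9090, per the compose file above) for its scrape targets. /api/v1/targets is Prometheus's standard HTTP API; the localhost host and the field handling below are just a sketch.

import json
import urllib.request

# Query the Prometheus container on 9090 (see the deleted compose file) and
# list each scrape target's job name, health, and URL.
with urllib.request.urlopen("http://localhost:9090/api/v1/targets") as resp:
  active = json.load(resp)["data"]["activeTargets"]

for target in active:
  print(target["labels"].get("job"), target["health"], target["scrapeUrl"])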
@@ -1,29 +0,0 @@
-from exo.orchestration import Node
-from prometheus_client import start_http_server, Counter, Histogram
-import json
-
-# Create metrics to track time spent and requests made.
-PROCESS_PROMPT_COUNTER = Counter("process_prompt_total", "Total number of prompts processed", ["node_id"])
-PROCESS_TENSOR_COUNTER = Counter("process_tensor_total", "Total number of tensors processed", ["node_id"])
-PROCESS_TENSOR_TIME = Histogram("process_tensor_seconds", "Time spent processing tensor", ["node_id"])
-
-
-def start_metrics_server(node: Node, port: int):
-  start_http_server(port)
-
-  def _on_opaque_status(request_id, opaque_status: str):
-    status_data = json.loads(opaque_status)
-    _type = status_data.get("type", "")
-    node_id = status_data.get("node_id", "")
-    if _type != "node_status":
-      return
-    status = status_data.get("status", "")
-
-    if status == "end_process_prompt":
-      PROCESS_PROMPT_COUNTER.labels(node_id=node_id).inc()
-    elif status == "end_process_tensor":
-      elapsed_time_ns = status_data.get("elapsed_time_ns", 0)
-      PROCESS_TENSOR_COUNTER.labels(node_id=node_id).inc()
-      PROCESS_TENSOR_TIME.labels(node_id=node_id).observe(elapsed_time_ns/1e9)  # Convert ns to seconds
-
-  node.on_opaque_status.register("stats").on_next(_on_opaque_status)
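For reference, the deleted handler expected opaque_status to arrive as a JSON string with type, node_id, status, and (for tensor events) elapsed_time_ns fields. The snippet below simply replays its parsing and ns-to-seconds conversion on a made-up payload so the expected shape is easy to see.

import json

# Made-up payload in the shape the deleted _on_opaque_status handler parsed.
opaque_status = json.dumps({
  "type": "node_status",
  "node_id": "node-1",
  "status": "end_process_tensor",
  "elapsed_time_ns": 42_000_000,
})

status_data = json.loads(opaque_status)
if status_data.get("type") == "node_status" and status_data.get("status") == "end_process_tensor":
  seconds = status_data.get("elapsed_time_ns", 0) / 1e9  # same ns-to-seconds conversion as the deleted code
  print(f"{status_data['node_id']}: tensor processed in {seconds:.3f}s")  # -> 0.042s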
@@ -1,7 +0,0 @@
-global:
-  scrape_interval: 15s
-
-scrape_configs:
-  - job_name: 'exo-node'
-    static_configs:
-      - targets: ['host.docker.internal:8005']
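The scrape target above points at the node's prometheus_client exporter. Before this commit you could sanity-check it from the host (so localhost rather than host.docker.internal) with something like the sketch below; port 8005 assumes the node was launched with --prometheus-client-port 8005, and the metric names come from the deleted metrics.py.

import urllib.request

# Fetch the plain-text exposition from the prometheus_client HTTP server and
# print just the series defined in the deleted metrics.py.
with urllib.request.urlopen("http://localhost:8005/metrics") as resp:
  body = resp.read().decode()

for line in body.splitlines():
  if line.startswith(("process_prompt_total", "process_tensor_total", "process_tensor_seconds")):
    print(line)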