From 075b40589481a52c1c8261f7d7d9bffd8c821af5 Mon Sep 17 00:00:00 2001
From: sufubao <sufubao@sensetime.com>
Date: Thu, 11 Jun 2026 08:20:18 +0000
Subject: [PATCH 1/5] feat(metrics): add model_name label and new
 throughput/cache metrics

- Add model_name label to all Prometheus metrics (histograms, counters, gauges)
  so metrics can be distinguished when multiple models are served
- Add counter_inc_by() method to Monitor, MetricServer and MetricClient
  for incrementing counters by arbitrary amounts
- Add new metrics:
  - lightllm_prompt_tokens_total: total prefill tokens processed
  - lightllm_generation_tokens_total: total generation tokens processed
  - lightllm_cache_hit_rate: prefix cache hit rate
  - lightllm_gen_throughput: generation throughput (tokens/s)
  - lightllm_num_running_reqs: number of running requests

Ported from qwen35 branch.
---
 lightllm/server/metrics/manager.py | 10 +++++++
 lightllm/server/metrics/metrics.py | 43 ++++++++++++++++++------------
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/lightllm/server/metrics/manager.py b/lightllm/server/metrics/manager.py
index f3b1a5275b..a95ddc0236 100644
--- a/lightllm/server/metrics/manager.py
+++ b/lightllm/server/metrics/manager.py
@@ -48,6 +48,9 @@ def on_disconnect(self, conn):
     def exposed_counter_inc(self, name: str, label: str = None) -> None:
         return self.monitor.counter_inc(name, label)
 
+    def exposed_counter_inc_by(self, name: str, amount: float) -> None:
+        return self.monitor.counter_inc_by(name, amount)
+
     def exposed_histogram_observe(self, name: str, value: float, label: str = None) -> None:
         return self.monitor.histogram_observe(name, value, label)
 
@@ -106,6 +109,13 @@ def inner_func():
         self._append_task(inner_func)
         return
 
+    def counter_inc_by(self, *args, **kwargs):
+        def inner_func():
+            return self.conn.root.counter_inc_by(*args, **kwargs)
+
+        self._append_task(inner_func)
+        return
+
     def histogram_observe(self, *args, **kwargs):
         def inner_func():
             return self.conn.root.histogram_observe(*args, **kwargs)
diff --git a/lightllm/server/metrics/metrics.py b/lightllm/server/metrics/metrics.py
index 130f32c7a7..5debbd07ed 100644
--- a/lightllm/server/metrics/metrics.py
+++ b/lightllm/server/metrics/metrics.py
@@ -27,6 +27,11 @@
     "lightllm_cache_ratio": "cache length / input_length",
     "lightllm_batch_current_max_tokens": "dynamic max token used for current batch",
     "lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step",
+    "lightllm_prompt_tokens_total": "Total number of prefill tokens processed",
+    "lightllm_generation_tokens_total": "Total number of generation tokens processed",
+    "lightllm_cache_hit_rate": "Prefix cache hit rate",
+    "lightllm_gen_throughput": "Generation throughput (tokens/s)",
+    "lightllm_num_running_reqs": "Number of running requests",
 }
 
 
@@ -60,6 +65,7 @@ def __init__(self, args):
         self.init_metrics(args)
 
     def init_metrics(self, args):
+        self.model_name = args.model_name
 
         self.create_histogram("lightllm_request_duration", self.duration_buckets)
         self.create_histogram("lightllm_request_validation_duration", self.duration_buckets)
@@ -100,40 +106,43 @@ def init_metrics(self, args):
             mtp_avg_token_per_step_buckets = [1.0, 2.0]
         self.create_histogram("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step_buckets)
 
+        self.create_counter("lightllm_prompt_tokens_total")
+        self.create_counter("lightllm_generation_tokens_total")
+        self.create_gauge("lightllm_cache_hit_rate")
+        self.create_gauge("lightllm_gen_throughput")
+        self.create_gauge("lightllm_num_running_reqs")
+
     def create_histogram(self, name, buckets, labelnames=None):
-        if labelnames is None:
-            histogram = Histogram(name, MONITOR_INFO[name], buckets=buckets, registry=self.registry)
-        else:
-            histogram = Histogram(
-                name, MONITOR_INFO[name], labelnames=labelnames, buckets=buckets, registry=self.registry
-            )
+        all_labels = ["model_name"] + (labelnames or [])
+        histogram = Histogram(name, MONITOR_INFO[name], labelnames=all_labels, buckets=buckets, registry=self.registry)
         self.monitor_registry[name] = histogram
 
     def create_counter(self, name, labelnames=None):
-        if labelnames is None:
-            histogram = Counter(name, MONITOR_INFO[name], registry=self.registry)
-        else:
-            histogram = Counter(name, MONITOR_INFO[name], labelnames=labelnames, registry=self.registry)
-        self.monitor_registry[name] = histogram
+        all_labels = ["model_name"] + (labelnames or [])
+        counter = Counter(name, MONITOR_INFO[name], labelnames=all_labels, registry=self.registry)
+        self.monitor_registry[name] = counter
 
     def create_gauge(self, name):
-        gauge = Gauge(name, MONITOR_INFO[name], registry=self.registry)
+        gauge = Gauge(name, MONITOR_INFO[name], labelnames=["model_name"], registry=self.registry)
         self.monitor_registry[name] = gauge
 
     def counter_inc(self, name, label=None):
         if label is None:
-            self.monitor_registry[name].inc()
+            self.monitor_registry[name].labels(model_name=self.model_name).inc()
         else:
-            self.monitor_registry[name].labels(method=label).inc()
+            self.monitor_registry[name].labels(model_name=self.model_name, method=label).inc()
+
+    def counter_inc_by(self, name, amount):
+        self.monitor_registry[name].labels(model_name=self.model_name).inc(amount)
 
     def histogram_observe(self, name, value, label=None):
         if label is None:
-            self.monitor_registry[name].observe(value)
+            self.monitor_registry[name].labels(model_name=self.model_name).observe(value)
         else:
-            self.monitor_registry[name].labels(method=label).observe(value)
+            self.monitor_registry[name].labels(model_name=self.model_name, method=label).observe(value)
 
     def gauge_set(self, name, value):
-        self.monitor_registry[name].set(value)
+        self.monitor_registry[name].labels(model_name=self.model_name).set(value)
 
     def push_metrices(self):
         if self.gateway_url is not None:

From 00ebd5b80de1ddddb29b0783a3cb674c3b6aa496 Mon Sep 17 00:00:00 2001
From: jyily <jyily.work@gmail.com>
Date: Thu, 11 Jun 2026 08:39:48 +0000
Subject: [PATCH 2/5] feat(metrics): populate new throughput/cache metrics from
 router

Port the metric-reporting part of qwen35's SystemStatusReporter so the
new metrics actually receive values on main:

- lightllm_prompt_tokens_total: incremented with batch.input_tokens()
  when a prefill batch is dispatched
- lightllm_generation_tokens_total: incremented per decode step with
  the number of running requests
- lightllm_cache_hit_rate / lightllm_gen_throughput /
  lightllm_num_running_reqs: gauges refreshed every log_stats_interval
  seconds (min 5s), same cadence and semantics as the qwen35 branch

Unlike qwen35, main's existing router logging is left untouched; only
the /metrics reporting is ported.
---
 lightllm/server/router/manager.py | 17 ++++++++-
 lightllm/server/router/stats.py   | 62 +++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index a5419adb9e..2dd534ee2b 100644
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -33,8 +33,7 @@
 from lightllm.utils.process_check import start_parent_check_thread
 from lightllm.utils.envs_utils import get_unique_server_name
 from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
-from .stats import RouterStatics
-
+from .stats import RouterStatics, SystemStatusReporter
 
 logger = init_logger(__name__)
 
@@ -103,6 +102,7 @@ def __init__(self, args: StartArgs):
             else CpuKvCacheClient(only_create_meta_data=True, init_shm_data=False)
         )
         self.router_statics = RouterStatics(self.args)
+        self.status_reporter: SystemStatusReporter = None
         return
 
     async def wait_to_model_ready(self):
@@ -194,6 +194,7 @@ async def wait_to_model_ready(self):
             )
         self.req_queue = build_req_queue(self.args, self, self.dp_size_in_node)
         logger.info(f"use req queue {self.req_queue.__class__.__name__}")
+        self.status_reporter = SystemStatusReporter(args=self.args, metric_client=self.metric_client)
 
         if self.args.run_mode == "prefill":
             from lightllm.server.router.model_infer.mode_backend.pd.prefill_node_impl import (
@@ -223,6 +224,8 @@ async def loop_for_fwd(
             await self._step()
             counter_count += 1
             if self.running_batch is not None:
+                # 统计 decode 阶段输出 token 数（每个 running req 每步约产出 1 个 token）
+                self.status_reporter.count_output_tokens(len(self.running_batch.reqs))
                 if counter_count % 100 == 0:
                     for dp_index in range(self.dp_size_in_node):
                         token_ratio1 = self.get_used_tokens(dp_index) / self.max_total_token_num
@@ -266,6 +269,8 @@ async def loop_for_fwd(
                             estimated_peak_token_count = self.shared_token_load.get_estimated_peak_token_count(dp_i)
                             logger.debug(f"dp_i {dp_i} estimated_peak_token_count: {estimated_peak_token_count} \n")
 
+            self.status_reporter.maybe_report(self.running_batch)
+
             await asyncio.sleep(self._get_schedule_time_interval())
 
     async def _step(self):
@@ -294,6 +299,7 @@ async def _step(self):
 
     async def _add_batch(self, batch: Batch):
         # 添加新请求
+        self.status_reporter.count_prompt_tokens(batch.input_tokens())
         reqs = [r.to_router_rpc_obj() for r in batch.reqs]
         while not self.shm_reqs_io_buffer.is_empty():
             await asyncio.sleep(0.001)
@@ -327,6 +333,13 @@ def _add_new_batch_to_running_batch(self, new_batch: Batch):
 
     def _filter_reqs_from_running_batch(self):
         if self.running_batch is not None:
+            # 在过滤前，捕获已完成请求的统计信息，用于计算 cache 命中率
+            for req in self.running_batch.reqs:
+                if req.shm_infer_released:
+                    self.status_reporter.on_request_completed(
+                        input_len=req.input_len,
+                        gpu_cache_len=req.prompt_cache_len,
+                    )
             self.running_batch.filter_out_finished_req(self.shm_req_manager, self.router_statics)
             if self.running_batch.is_clear():
                 self.running_batch = None
diff --git a/lightllm/server/router/stats.py b/lightllm/server/router/stats.py
index b715c5bcb3..ad7018341a 100644
--- a/lightllm/server/router/stats.py
+++ b/lightllm/server/router/stats.py
@@ -1,9 +1,71 @@
+import time
 from lightllm.utils.log_utils import init_logger
 from lightllm.server.core.objs import StartArgs
 
 logger = init_logger(__name__)
 
 
+class SystemStatusReporter:
+    """统计 token 吞吐和 prefix cache 命中情况，并周期性上报到 prometheus 监控指标。"""
+
+    def __init__(self, args: StartArgs, metric_client=None):
+        self.enabled = not args.disable_log_stats
+        self.interval = max(5, args.log_stats_interval)
+        if args.log_stats_interval < 5:
+            logger.warning(f"log_stats_interval={args.log_stats_interval}s is below minimum, using 5s")
+        self.metric_client = metric_client
+
+        # 窗口期计数器（每个上报周期重置）
+        self.last_report_time = time.time()
+        self.prompt_tokens = 0
+        self.output_tokens = 0
+
+        # 全局计数器（不重置，用于计算全局 cache 命中率）
+        self.global_input_total = 0
+        self.global_gpu_cache_total = 0
+
+    def count_prompt_tokens(self, num_tokens: int):
+        if self.metric_client is not None:
+            self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", num_tokens)
+        if self.enabled:
+            self.prompt_tokens += num_tokens
+
+    def count_output_tokens(self, num_tokens: int):
+        if self.metric_client is not None:
+            self.metric_client.counter_inc_by("lightllm_generation_tokens_total", num_tokens)
+        if self.enabled:
+            self.output_tokens += num_tokens
+
+    def on_request_completed(self, input_len: int, gpu_cache_len: int):
+        if self.enabled:
+            self.global_input_total += input_len
+            self.global_gpu_cache_total += gpu_cache_len
+
+    def maybe_report(self, running_batch):
+        if not self.enabled:
+            return
+        now = time.time()
+        elapsed = now - self.last_report_time
+        if elapsed < self.interval:
+            return
+
+        output_tps = self.output_tokens / elapsed
+        running = len(running_batch.reqs) if running_batch is not None else 0
+        global_gpu_cache_hit_rate = (
+            (self.global_gpu_cache_total / self.global_input_total) if self.global_input_total > 0 else 0.0
+        )
+
+        if self.metric_client is not None:
+            self.metric_client.gauge_set("lightllm_cache_hit_rate", global_gpu_cache_hit_rate)
+            self.metric_client.gauge_set("lightllm_gen_throughput", output_tps)
+            self.metric_client.gauge_set("lightllm_num_running_reqs", running)
+
+        # 重置窗口期计数器
+        self.prompt_tokens = 0
+        self.output_tokens = 0
+        self.last_report_time = now
+
+
 class RouterStatics:
     def __init__(self, args: StartArgs):
         self.busy_token_used_ratio = args.router_token_ratio

From 6c11387cad22dd4966ebb4f112d0b315b7eae98a Mon Sep 17 00:00:00 2001
From: jyily <jyily.work@gmail.com>
Date: Thu, 11 Jun 2026 08:48:22 +0000
Subject: [PATCH 3/5] fix(metrics): report total paused req num for
 lightllm_batch_pause_size

Previously the gauge was set inside the per-dp debug loop, so in
multi-dp deployments it only held the last dp's paused count. Align
with qwen35 by reporting the total via _get_paused_req_num().
---
 lightllm/server/router/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index 2dd534ee2b..edbccb0f47 100644
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -244,7 +244,7 @@ async def loop_for_fwd(
                             f"dp_i {d_i} token used ratio: {token_ratio2} contain prompt cache tree unrefed token"
                         )
                         logger.debug(self.router_statics.log_str())
-                        self.metric_client.gauge_set("lightllm_batch_pause_size", paused_req_num)
+                    self.metric_client.gauge_set("lightllm_batch_pause_size", self._get_paused_req_num())
                 # pd decode mode need to update token_load more frequently
                 self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode)
                 self.metric_client.gauge_set("lightllm_batch_current_size", len(self.running_batch.reqs))

From f3084e2d0b8f1452f0161273ba21a30199db3861 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 11 Jun 2026 11:30:36 +0000
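Subject: [PATCH 4/5] simplify the metrics

Drop the router-side SystemStatusReporter and report the new metrics
from existing code paths instead:

- lightllm_prompt_tokens_total / lightllm_generation_tokens_total:
  incremented in the httpserver for each completed request
- lightllm_cache_hit_rate / lightllm_gen_throughput: gauges set from the
  latest completed request
- lightllm_num_running_reqs: set in the router loop next to
  lightllm_batch_current_size

lightllm_cache_length and lightllm_cache_ratio now cover the gpu + cpu +
disk prompt cache length instead of the gpu prompt cache length only.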
---
 lightllm/server/httpserver/manager.py | 20 ++++++---
 lightllm/server/metrics/metrics.py    |  4 +-
 lightllm/server/router/manager.py     | 18 ++------
 lightllm/server/router/stats.py       | 62 ---------------------------
 4 files changed, 19 insertions(+), 85 deletions(-)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
index 8fdd277f57..e47692d1b0 100644
--- a/lightllm/server/httpserver/manager.py
+++ b/lightllm/server/httpserver/manager.py
@@ -706,10 +706,10 @@ async def _wait_to_token_package(
                     if self.pd_mode.is_P() and is_first_token:
                         metadata["prompt_ids"] = prompt_ids
 
-                    prompt_cache_len = metadata.pop("prompt_cache_len", 0)
+                    gpu_prompt_cache_len = metadata.pop("prompt_cache_len", 0)
                     cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
                     disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
-                    metadata["prompt_cache_len"] = prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
+                    metadata["prompt_cache_len"] = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
                     sub_req_id_to_mtp_accepted_token_num[sub_req_id] = metadata.get("mtp_accepted_token_num", 0)
 
                     if is_first_token:
@@ -733,9 +733,12 @@ async def _wait_to_token_package(
                         self.per_token_costs.add(mean_per_token_cost_time_ms)
                         x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
                         x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
-                        prompt_cache_ratio = prompt_cache_len / prompt_tokens
+                        gpu_prompt_cache_ratio = gpu_prompt_cache_len / prompt_tokens
                         cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
                         disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
+                        prompt_cache_len = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len
+                        prompt_cache_ratio = prompt_cache_len / prompt_tokens
+                        generation_throughput = out_token_counter / max(total_cost_time_ms / 1000.0, 1e-6)
 
                         mtp_avg_token_per_step = out_token_counter / max(
                             (out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1
@@ -748,9 +751,9 @@ async def _wait_to_token_package(
                             f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} "
                             f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms "
                             f"prompt_token_num:{prompt_tokens} "
-                            f"gpu cache hit: {prompt_cache_len > 0} "
-                            f"gpu_prompt_cache_len:{prompt_cache_len} "
-                            f"gpu_prompt_cache_ratio:{prompt_cache_ratio} "
+                            f"gpu cache hit: {gpu_prompt_cache_ratio > 0} "
+                            f"gpu_prompt_cache_len:{gpu_prompt_cache_len} "
+                            f"gpu_prompt_cache_ratio:{gpu_prompt_cache_ratio} "
                             f"cpu cache hit: {cpu_prompt_cache_len > 0} "
                             f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
                             f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
@@ -759,8 +762,13 @@ async def _wait_to_token_package(
                             f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
                             f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
                         )
+
                         self.metric_client.histogram_observe("lightllm_cache_length", prompt_cache_len)
                         self.metric_client.histogram_observe("lightllm_cache_ratio", prompt_cache_ratio)
+                        self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", prompt_tokens)
+                        self.metric_client.counter_inc_by("lightllm_generation_tokens_total", out_token_counter)
+                        self.metric_client.gauge_set("lightllm_cache_hit_rate", prompt_cache_ratio)
+                        self.metric_client.gauge_set("lightllm_gen_throughput", generation_throughput)
                         self.metric_client.histogram_observe(
                             "lightllm_request_inference_duration", total_cost_time_ms / 1000.0
                         )
diff --git a/lightllm/server/metrics/metrics.py b/lightllm/server/metrics/metrics.py
index 5debbd07ed..0d42462c3f 100644
--- a/lightllm/server/metrics/metrics.py
+++ b/lightllm/server/metrics/metrics.py
@@ -29,8 +29,8 @@
     "lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step",
     "lightllm_prompt_tokens_total": "Total number of prefill tokens processed",
     "lightllm_generation_tokens_total": "Total number of generation tokens processed",
-    "lightllm_cache_hit_rate": "Prefix cache hit rate",
-    "lightllm_gen_throughput": "Generation throughput (tokens/s)",
+    "lightllm_cache_hit_rate": "Prefix cache hit rate of latest completed request",
+    "lightllm_gen_throughput": "Generation throughput of latest completed request (tokens/s)",
     "lightllm_num_running_reqs": "Number of running requests",
 }
 
diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index edbccb0f47..1de4238a5c 100644
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -33,7 +33,7 @@
 from lightllm.utils.process_check import start_parent_check_thread
 from lightllm.utils.envs_utils import get_unique_server_name
 from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
-from .stats import RouterStatics, SystemStatusReporter
+from .stats import RouterStatics
 
 logger = init_logger(__name__)
 
@@ -102,7 +102,6 @@ def __init__(self, args: StartArgs):
             else CpuKvCacheClient(only_create_meta_data=True, init_shm_data=False)
         )
         self.router_statics = RouterStatics(self.args)
-        self.status_reporter: SystemStatusReporter = None
         return
 
     async def wait_to_model_ready(self):
@@ -194,7 +193,6 @@ async def wait_to_model_ready(self):
             )
         self.req_queue = build_req_queue(self.args, self, self.dp_size_in_node)
         logger.info(f"use req queue {self.req_queue.__class__.__name__}")
-        self.status_reporter = SystemStatusReporter(args=self.args, metric_client=self.metric_client)
 
         if self.args.run_mode == "prefill":
             from lightllm.server.router.model_infer.mode_backend.pd.prefill_node_impl import (
@@ -224,8 +222,6 @@ async def loop_for_fwd(
             await self._step()
             counter_count += 1
             if self.running_batch is not None:
-                # 统计 decode 阶段输出 token 数（每个 running req 每步约产出 1 个 token）
-                self.status_reporter.count_output_tokens(len(self.running_batch.reqs))
                 if counter_count % 100 == 0:
                     for dp_index in range(self.dp_size_in_node):
                         token_ratio1 = self.get_used_tokens(dp_index) / self.max_total_token_num
@@ -248,6 +244,7 @@ async def loop_for_fwd(
                 # pd decode mode need to update token_load more frequently
                 self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode)
                 self.metric_client.gauge_set("lightllm_batch_current_size", len(self.running_batch.reqs))
+                self.metric_client.gauge_set("lightllm_num_running_reqs", len(self.running_batch.reqs))
                 self.metric_client.gauge_set("lightllm_queue_size", self.req_queue.get_wait_req_num())
                 self.metric_client.gauge_set(
                     "lightllm_batch_current_max_tokens",
@@ -260,6 +257,7 @@ async def loop_for_fwd(
                 self.req_queue.update_token_load(self.running_batch, force_update=True)
                 if counter_count % 300 == 0:
                     self.metric_client.gauge_set("lightllm_batch_current_size", 0.0)
+                    self.metric_client.gauge_set("lightllm_num_running_reqs", 0.0)
                     self.metric_client.gauge_set("lightllm_batch_pause_size", 0.0)
                     self.metric_client.gauge_set("lightllm_queue_size", 0.0)
                     self.metric_client.gauge_set("lightllm_batch_current_max_tokens", 0.0)
@@ -269,8 +267,6 @@ async def loop_for_fwd(
                             estimated_peak_token_count = self.shared_token_load.get_estimated_peak_token_count(dp_i)
                             logger.debug(f"dp_i {dp_i} estimated_peak_token_count: {estimated_peak_token_count} \n")
 
-            self.status_reporter.maybe_report(self.running_batch)
-
             await asyncio.sleep(self._get_schedule_time_interval())
 
     async def _step(self):
@@ -299,7 +295,6 @@ async def _step(self):
 
     async def _add_batch(self, batch: Batch):
         # 添加新请求
-        self.status_reporter.count_prompt_tokens(batch.input_tokens())
         reqs = [r.to_router_rpc_obj() for r in batch.reqs]
         while not self.shm_reqs_io_buffer.is_empty():
             await asyncio.sleep(0.001)
@@ -333,13 +328,6 @@ def _add_new_batch_to_running_batch(self, new_batch: Batch):
 
     def _filter_reqs_from_running_batch(self):
         if self.running_batch is not None:
-            # 在过滤前，捕获已完成请求的统计信息，用于计算 cache 命中率
-            for req in self.running_batch.reqs:
-                if req.shm_infer_released:
-                    self.status_reporter.on_request_completed(
-                        input_len=req.input_len,
-                        gpu_cache_len=req.prompt_cache_len,
-                    )
             self.running_batch.filter_out_finished_req(self.shm_req_manager, self.router_statics)
             if self.running_batch.is_clear():
                 self.running_batch = None
diff --git a/lightllm/server/router/stats.py b/lightllm/server/router/stats.py
index ad7018341a..b715c5bcb3 100644
--- a/lightllm/server/router/stats.py
+++ b/lightllm/server/router/stats.py
@@ -1,71 +1,9 @@
-import time
 from lightllm.utils.log_utils import init_logger
 from lightllm.server.core.objs import StartArgs
 
 logger = init_logger(__name__)
 
 
-class SystemStatusReporter:
-    """统计 token 吞吐和 prefix cache 命中情况，并周期性上报到 prometheus 监控指标。"""
-
-    def __init__(self, args: StartArgs, metric_client=None):
-        self.enabled = not args.disable_log_stats
-        self.interval = max(5, args.log_stats_interval)
-        if args.log_stats_interval < 5:
-            logger.warning(f"log_stats_interval={args.log_stats_interval}s is below minimum, using 5s")
-        self.metric_client = metric_client
-
-        # 窗口期计数器（每个上报周期重置）
-        self.last_report_time = time.time()
-        self.prompt_tokens = 0
-        self.output_tokens = 0
-
-        # 全局计数器（不重置，用于计算全局 cache 命中率）
-        self.global_input_total = 0
-        self.global_gpu_cache_total = 0
-
-    def count_prompt_tokens(self, num_tokens: int):
-        if self.metric_client is not None:
-            self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", num_tokens)
-        if self.enabled:
-            self.prompt_tokens += num_tokens
-
-    def count_output_tokens(self, num_tokens: int):
-        if self.metric_client is not None:
-            self.metric_client.counter_inc_by("lightllm_generation_tokens_total", num_tokens)
-        if self.enabled:
-            self.output_tokens += num_tokens
-
-    def on_request_completed(self, input_len: int, gpu_cache_len: int):
-        if self.enabled:
-            self.global_input_total += input_len
-            self.global_gpu_cache_total += gpu_cache_len
-
-    def maybe_report(self, running_batch):
-        if not self.enabled:
-            return
-        now = time.time()
-        elapsed = now - self.last_report_time
-        if elapsed < self.interval:
-            return
-
-        output_tps = self.output_tokens / elapsed
-        running = len(running_batch.reqs) if running_batch is not None else 0
-        global_gpu_cache_hit_rate = (
-            (self.global_gpu_cache_total / self.global_input_total) if self.global_input_total > 0 else 0.0
-        )
-
-        if self.metric_client is not None:
-            self.metric_client.gauge_set("lightllm_cache_hit_rate", global_gpu_cache_hit_rate)
-            self.metric_client.gauge_set("lightllm_gen_throughput", output_tps)
-            self.metric_client.gauge_set("lightllm_num_running_reqs", running)
-
-        # 重置窗口期计数器
-        self.prompt_tokens = 0
-        self.output_tokens = 0
-        self.last_report_time = now
-
-
 class RouterStatics:
     def __init__(self, args: StartArgs):
         self.busy_token_used_ratio = args.router_token_ratio

From f9f22d44bd230ea2f83e5bc4a888047744373d81 Mon Sep 17 00:00:00 2001
From: shihaobai <1798930569@qq.com>
Date: Thu, 11 Jun 2026 11:42:33 +0000
Subject: [PATCH 5/5] fix tool_check: default LIGHTLLM_ENABLE_TOOL_NAME_CHECK to False

---
 lightllm/server/function_call_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightllm/server/function_call_parser.py b/lightllm/server/function_call_parser.py
index f204c154ed..dfcb2f8d9e 100644
--- a/lightllm/server/function_call_parser.py
+++ b/lightllm/server/function_call_parser.py
@@ -30,7 +30,7 @@
 from .api_models import Tool
 
 logger = logging.getLogger(__name__)
-ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "True").upper() in ["ON", "TRUE", "1"]
+ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "False").upper() in ["ON", "TRUE", "1"]
 
 TOOLS_TAG_LIST = [
     "<|plugin|>",