From 075b40589481a52c1c8261f7d7d9bffd8c821af5 Mon Sep 17 00:00:00 2001 From: sufubao Date: Thu, 11 Jun 2026 08:20:18 +0000 Subject: [PATCH 1/5] feat(metrics): add model_name label and new throughput/cache metrics - Add model_name label to all Prometheus metrics (histograms, counters, gauges) so metrics can be distinguished when multiple models are served - Add counter_inc_by() method to Monitor, MetricServer and MetricClient for incrementing counters by arbitrary amounts - Add new metrics: - lightllm_prompt_tokens_total: total prefill tokens processed - lightllm_generation_tokens_total: total generation tokens processed - lightllm_cache_hit_rate: prefix cache hit rate - lightllm_gen_throughput: generation throughput (tokens/s) - lightllm_num_running_reqs: number of running requests Ported from qwen35 branch. --- lightllm/server/metrics/manager.py | 10 +++++++ lightllm/server/metrics/metrics.py | 43 ++++++++++++++++++------------ 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/lightllm/server/metrics/manager.py b/lightllm/server/metrics/manager.py index f3b1a5275b..a95ddc0236 100644 --- a/lightllm/server/metrics/manager.py +++ b/lightllm/server/metrics/manager.py @@ -48,6 +48,9 @@ def on_disconnect(self, conn): def exposed_counter_inc(self, name: str, label: str = None) -> None: return self.monitor.counter_inc(name, label) + def exposed_counter_inc_by(self, name: str, amount: float) -> None: + return self.monitor.counter_inc_by(name, amount) + def exposed_histogram_observe(self, name: str, value: float, label: str = None) -> None: return self.monitor.histogram_observe(name, value, label) @@ -106,6 +109,13 @@ def inner_func(): self._append_task(inner_func) return + def counter_inc_by(self, *args, **kwargs): + def inner_func(): + return self.conn.root.counter_inc_by(*args, **kwargs) + + self._append_task(inner_func) + return + def histogram_observe(self, *args, **kwargs): def inner_func(): return self.conn.root.histogram_observe(*args, **kwargs) diff 
--git a/lightllm/server/metrics/metrics.py b/lightllm/server/metrics/metrics.py index 130f32c7a7..5debbd07ed 100644 --- a/lightllm/server/metrics/metrics.py +++ b/lightllm/server/metrics/metrics.py @@ -27,6 +27,11 @@ "lightllm_cache_ratio": "cache length / input_length", "lightllm_batch_current_max_tokens": "dynamic max token used for current batch", "lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step", + "lightllm_prompt_tokens_total": "Total number of prefill tokens processed", + "lightllm_generation_tokens_total": "Total number of generation tokens processed", + "lightllm_cache_hit_rate": "Prefix cache hit rate", + "lightllm_gen_throughput": "Generation throughput (tokens/s)", + "lightllm_num_running_reqs": "Number of running requests", } @@ -60,6 +65,7 @@ def __init__(self, args): self.init_metrics(args) def init_metrics(self, args): + self.model_name = args.model_name self.create_histogram("lightllm_request_duration", self.duration_buckets) self.create_histogram("lightllm_request_validation_duration", self.duration_buckets) @@ -100,40 +106,43 @@ def init_metrics(self, args): mtp_avg_token_per_step_buckets = [1.0, 2.0] self.create_histogram("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step_buckets) + self.create_counter("lightllm_prompt_tokens_total") + self.create_counter("lightllm_generation_tokens_total") + self.create_gauge("lightllm_cache_hit_rate") + self.create_gauge("lightllm_gen_throughput") + self.create_gauge("lightllm_num_running_reqs") + def create_histogram(self, name, buckets, labelnames=None): - if labelnames is None: - histogram = Histogram(name, MONITOR_INFO[name], buckets=buckets, registry=self.registry) - else: - histogram = Histogram( - name, MONITOR_INFO[name], labelnames=labelnames, buckets=buckets, registry=self.registry - ) + all_labels = ["model_name"] + (labelnames or []) + histogram = Histogram(name, MONITOR_INFO[name], labelnames=all_labels, buckets=buckets, registry=self.registry) 
self.monitor_registry[name] = histogram def create_counter(self, name, labelnames=None): - if labelnames is None: - histogram = Counter(name, MONITOR_INFO[name], registry=self.registry) - else: - histogram = Counter(name, MONITOR_INFO[name], labelnames=labelnames, registry=self.registry) - self.monitor_registry[name] = histogram + all_labels = ["model_name"] + (labelnames or []) + counter = Counter(name, MONITOR_INFO[name], labelnames=all_labels, registry=self.registry) + self.monitor_registry[name] = counter def create_gauge(self, name): - gauge = Gauge(name, MONITOR_INFO[name], registry=self.registry) + gauge = Gauge(name, MONITOR_INFO[name], labelnames=["model_name"], registry=self.registry) self.monitor_registry[name] = gauge def counter_inc(self, name, label=None): if label is None: - self.monitor_registry[name].inc() + self.monitor_registry[name].labels(model_name=self.model_name).inc() else: - self.monitor_registry[name].labels(method=label).inc() + self.monitor_registry[name].labels(model_name=self.model_name, method=label).inc() + + def counter_inc_by(self, name, amount): + self.monitor_registry[name].labels(model_name=self.model_name).inc(amount) def histogram_observe(self, name, value, label=None): if label is None: - self.monitor_registry[name].observe(value) + self.monitor_registry[name].labels(model_name=self.model_name).observe(value) else: - self.monitor_registry[name].labels(method=label).observe(value) + self.monitor_registry[name].labels(model_name=self.model_name, method=label).observe(value) def gauge_set(self, name, value): - self.monitor_registry[name].set(value) + self.monitor_registry[name].labels(model_name=self.model_name).set(value) def push_metrices(self): if self.gateway_url is not None: From 00ebd5b80de1ddddb29b0783a3cb674c3b6aa496 Mon Sep 17 00:00:00 2001 From: jyily Date: Thu, 11 Jun 2026 08:39:48 +0000 Subject: [PATCH 2/5] feat(metrics): populate new throughput/cache metrics from router Port the metric-reporting part of qwen35's 
SystemStatusReporter so the new metrics actually receive values on main: - lightllm_prompt_tokens_total: incremented with batch.input_tokens() when a prefill batch is dispatched - lightllm_generation_tokens_total: incremented per decode step with the number of running requests - lightllm_cache_hit_rate / lightllm_gen_throughput / lightllm_num_running_reqs: gauges refreshed every log_stats_interval seconds (min 5s), same cadence and semantics as the qwen35 branch Unlike qwen35, main's existing router logging is left untouched; only the /metrics reporting is ported. --- lightllm/server/router/manager.py | 17 ++++++++- lightllm/server/router/stats.py | 62 +++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py index a5419adb9e..2dd534ee2b 100644 --- a/lightllm/server/router/manager.py +++ b/lightllm/server/router/manager.py @@ -33,8 +33,7 @@ from lightllm.utils.process_check import start_parent_check_thread from lightllm.utils.envs_utils import get_unique_server_name from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt -from .stats import RouterStatics - +from .stats import RouterStatics, SystemStatusReporter logger = init_logger(__name__) @@ -103,6 +102,7 @@ def __init__(self, args: StartArgs): else CpuKvCacheClient(only_create_meta_data=True, init_shm_data=False) ) self.router_statics = RouterStatics(self.args) + self.status_reporter: SystemStatusReporter = None return async def wait_to_model_ready(self): @@ -194,6 +194,7 @@ async def wait_to_model_ready(self): ) self.req_queue = build_req_queue(self.args, self, self.dp_size_in_node) logger.info(f"use req queue {self.req_queue.__class__.__name__}") + self.status_reporter = SystemStatusReporter(args=self.args, metric_client=self.metric_client) if self.args.run_mode == "prefill": from lightllm.server.router.model_infer.mode_backend.pd.prefill_node_impl import ( @@ -223,6 +224,8 @@ async def 
loop_for_fwd( await self._step() counter_count += 1 if self.running_batch is not None: + # 统计 decode 阶段输出 token 数(每个 running req 每步约产出 1 个 token) + self.status_reporter.count_output_tokens(len(self.running_batch.reqs)) if counter_count % 100 == 0: for dp_index in range(self.dp_size_in_node): token_ratio1 = self.get_used_tokens(dp_index) / self.max_total_token_num @@ -266,6 +269,8 @@ async def loop_for_fwd( estimated_peak_token_count = self.shared_token_load.get_estimated_peak_token_count(dp_i) logger.debug(f"dp_i {dp_i} estimated_peak_token_count: {estimated_peak_token_count} \n") + self.status_reporter.maybe_report(self.running_batch) + await asyncio.sleep(self._get_schedule_time_interval()) async def _step(self): @@ -294,6 +299,7 @@ async def _step(self): async def _add_batch(self, batch: Batch): # 添加新请求 + self.status_reporter.count_prompt_tokens(batch.input_tokens()) reqs = [r.to_router_rpc_obj() for r in batch.reqs] while not self.shm_reqs_io_buffer.is_empty(): await asyncio.sleep(0.001) @@ -327,6 +333,13 @@ def _add_new_batch_to_running_batch(self, new_batch: Batch): def _filter_reqs_from_running_batch(self): if self.running_batch is not None: + # 在过滤前,捕获已完成请求的统计信息,用于计算 cache 命中率 + for req in self.running_batch.reqs: + if req.shm_infer_released: + self.status_reporter.on_request_completed( + input_len=req.input_len, + gpu_cache_len=req.prompt_cache_len, + ) self.running_batch.filter_out_finished_req(self.shm_req_manager, self.router_statics) if self.running_batch.is_clear(): self.running_batch = None diff --git a/lightllm/server/router/stats.py b/lightllm/server/router/stats.py index b715c5bcb3..ad7018341a 100644 --- a/lightllm/server/router/stats.py +++ b/lightllm/server/router/stats.py @@ -1,9 +1,71 @@ +import time from lightllm.utils.log_utils import init_logger from lightllm.server.core.objs import StartArgs logger = init_logger(__name__) +class SystemStatusReporter: + """统计 token 吞吐和 prefix cache 命中情况,并周期性上报到 prometheus 监控指标。""" + + def __init__(self, 
args: StartArgs, metric_client=None): + self.enabled = not args.disable_log_stats + self.interval = max(5, args.log_stats_interval) + if args.log_stats_interval < 5: + logger.warning(f"log_stats_interval={args.log_stats_interval}s is below minimum, using 5s") + self.metric_client = metric_client + + # 窗口期计数器(每个上报周期重置) + self.last_report_time = time.time() + self.prompt_tokens = 0 + self.output_tokens = 0 + + # 全局计数器(不重置,用于计算全局 cache 命中率) + self.global_input_total = 0 + self.global_gpu_cache_total = 0 + + def count_prompt_tokens(self, num_tokens: int): + if self.metric_client is not None: + self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", num_tokens) + if self.enabled: + self.prompt_tokens += num_tokens + + def count_output_tokens(self, num_tokens: int): + if self.metric_client is not None: + self.metric_client.counter_inc_by("lightllm_generation_tokens_total", num_tokens) + if self.enabled: + self.output_tokens += num_tokens + + def on_request_completed(self, input_len: int, gpu_cache_len: int): + if self.enabled: + self.global_input_total += input_len + self.global_gpu_cache_total += gpu_cache_len + + def maybe_report(self, running_batch): + if not self.enabled: + return + now = time.time() + elapsed = now - self.last_report_time + if elapsed < self.interval: + return + + output_tps = self.output_tokens / elapsed + running = len(running_batch.reqs) if running_batch is not None else 0 + global_gpu_cache_hit_rate = ( + (self.global_gpu_cache_total / self.global_input_total) if self.global_input_total > 0 else 0.0 + ) + + if self.metric_client is not None: + self.metric_client.gauge_set("lightllm_cache_hit_rate", global_gpu_cache_hit_rate) + self.metric_client.gauge_set("lightllm_gen_throughput", output_tps) + self.metric_client.gauge_set("lightllm_num_running_reqs", running) + + # 重置窗口期计数器 + self.prompt_tokens = 0 + self.output_tokens = 0 + self.last_report_time = now + + class RouterStatics: def __init__(self, args: StartArgs): 
self.busy_token_used_ratio = args.router_token_ratio From 6c11387cad22dd4966ebb4f112d0b315b7eae98a Mon Sep 17 00:00:00 2001 From: jyily Date: Thu, 11 Jun 2026 08:48:22 +0000 Subject: [PATCH 3/5] fix(metrics): report total paused req num for lightllm_batch_pause_size Previously the gauge was set inside the per-dp debug loop, so in multi-dp deployments it only held the last dp's paused count. Align with qwen35 by reporting the total via _get_paused_req_num(). --- lightllm/server/router/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py index 2dd534ee2b..edbccb0f47 100644 --- a/lightllm/server/router/manager.py +++ b/lightllm/server/router/manager.py @@ -244,7 +244,7 @@ async def loop_for_fwd( f"dp_i {d_i} token used ratio: {token_ratio2} contain prompt cache tree unrefed token" ) logger.debug(self.router_statics.log_str()) - self.metric_client.gauge_set("lightllm_batch_pause_size", paused_req_num) + self.metric_client.gauge_set("lightllm_batch_pause_size", self._get_paused_req_num()) # pd decode mode need to update token_load more frequently self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode) self.metric_client.gauge_set("lightllm_batch_current_size", len(self.running_batch.reqs)) From f3084e2d0b8f1452f0161273ba21a30199db3861 Mon Sep 17 00:00:00 2001 From: shihaobai <1798930569@qq.com> Date: Thu, 11 Jun 2026 11:30:36 +0000 Subject: [PATCH 4/5] simplify the metrics --- lightllm/server/httpserver/manager.py | 20 ++++++--- lightllm/server/metrics/metrics.py | 4 +- lightllm/server/router/manager.py | 18 ++------ lightllm/server/router/stats.py | 62 --------------------------- 4 files changed, 19 insertions(+), 85 deletions(-) diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 8fdd277f57..e47692d1b0 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@
-706,10 +706,10 @@ async def _wait_to_token_package( if self.pd_mode.is_P() and is_first_token: metadata["prompt_ids"] = prompt_ids - prompt_cache_len = metadata.pop("prompt_cache_len", 0) + gpu_prompt_cache_len = metadata.pop("prompt_cache_len", 0) cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0) disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0) - metadata["prompt_cache_len"] = prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len + metadata["prompt_cache_len"] = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len sub_req_id_to_mtp_accepted_token_num[sub_req_id] = metadata.get("mtp_accepted_token_num", 0) if is_first_token: @@ -733,9 +733,12 @@ async def _wait_to_token_package( self.per_token_costs.add(mean_per_token_cost_time_ms) x_request_id = request.headers.get("X-Request-Id", "") if request is not None else "" x_session_id = request.headers.get("X-Session-Id", "") if request is not None else "" - prompt_cache_ratio = prompt_cache_len / prompt_tokens + gpu_prompt_cache_ratio = gpu_prompt_cache_len / prompt_tokens cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens + prompt_cache_len = gpu_prompt_cache_len + cpu_prompt_cache_len + disk_prompt_cache_len + prompt_cache_ratio = prompt_cache_len / prompt_tokens + generation_throughput = out_token_counter / max(total_cost_time_ms / 1000.0, 1e-6) mtp_avg_token_per_step = out_token_counter / max( (out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1 @@ -748,9 +751,9 @@ async def _wait_to_token_package( f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} " f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms " f"prompt_token_num:{prompt_tokens} " - f"gpu cache hit: {prompt_cache_len > 0} " - f"gpu_prompt_cache_len:{prompt_cache_len} " - f"gpu_prompt_cache_ratio:{prompt_cache_ratio} " + f"gpu cache hit: {gpu_prompt_cache_ratio > 0} " + 
f"gpu_prompt_cache_len:{gpu_prompt_cache_len} " + f"gpu_prompt_cache_ratio:{gpu_prompt_cache_ratio} " f"cpu cache hit: {cpu_prompt_cache_len > 0} " f"cpu_prompt_cache_len:{cpu_prompt_cache_len} " f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} " @@ -759,8 +762,13 @@ async def _wait_to_token_package( f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} " f"mtp_avg_token_per_step:{mtp_avg_token_per_step} " ) + self.metric_client.histogram_observe("lightllm_cache_length", prompt_cache_len) self.metric_client.histogram_observe("lightllm_cache_ratio", prompt_cache_ratio) + self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", prompt_tokens) + self.metric_client.counter_inc_by("lightllm_generation_tokens_total", out_token_counter) + self.metric_client.gauge_set("lightllm_cache_hit_rate", prompt_cache_ratio) + self.metric_client.gauge_set("lightllm_gen_throughput", generation_throughput) self.metric_client.histogram_observe( "lightllm_request_inference_duration", total_cost_time_ms / 1000.0 ) diff --git a/lightllm/server/metrics/metrics.py b/lightllm/server/metrics/metrics.py index 5debbd07ed..0d42462c3f 100644 --- a/lightllm/server/metrics/metrics.py +++ b/lightllm/server/metrics/metrics.py @@ -29,8 +29,8 @@ "lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step", "lightllm_prompt_tokens_total": "Total number of prefill tokens processed", "lightllm_generation_tokens_total": "Total number of generation tokens processed", - "lightllm_cache_hit_rate": "Prefix cache hit rate", - "lightllm_gen_throughput": "Generation throughput (tokens/s)", + "lightllm_cache_hit_rate": "Prefix cache hit rate of latest completed request", + "lightllm_gen_throughput": "Generation throughput of latest completed request (tokens/s)", "lightllm_num_running_reqs": "Number of running requests", } diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py index edbccb0f47..1de4238a5c 100644 --- a/lightllm/server/router/manager.py +++ 
b/lightllm/server/router/manager.py @@ -33,7 +33,7 @@ from lightllm.utils.process_check import start_parent_check_thread from lightllm.utils.envs_utils import get_unique_server_name from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt -from .stats import RouterStatics, SystemStatusReporter +from .stats import RouterStatics logger = init_logger(__name__) @@ -102,7 +102,6 @@ def __init__(self, args: StartArgs): else CpuKvCacheClient(only_create_meta_data=True, init_shm_data=False) ) self.router_statics = RouterStatics(self.args) - self.status_reporter: SystemStatusReporter = None return async def wait_to_model_ready(self): @@ -194,7 +193,6 @@ async def wait_to_model_ready(self): ) self.req_queue = build_req_queue(self.args, self, self.dp_size_in_node) logger.info(f"use req queue {self.req_queue.__class__.__name__}") - self.status_reporter = SystemStatusReporter(args=self.args, metric_client=self.metric_client) if self.args.run_mode == "prefill": from lightllm.server.router.model_infer.mode_backend.pd.prefill_node_impl import ( @@ -224,8 +222,6 @@ async def loop_for_fwd( await self._step() counter_count += 1 if self.running_batch is not None: - # 统计 decode 阶段输出 token 数(每个 running req 每步约产出 1 个 token) - self.status_reporter.count_output_tokens(len(self.running_batch.reqs)) if counter_count % 100 == 0: for dp_index in range(self.dp_size_in_node): token_ratio1 = self.get_used_tokens(dp_index) / self.max_total_token_num @@ -248,6 +244,7 @@ async def loop_for_fwd( # pd decode mode need to update token_load more frequently self.req_queue.update_token_load(self.running_batch, force_update=self.is_pd_decode_mode) self.metric_client.gauge_set("lightllm_batch_current_size", len(self.running_batch.reqs)) + self.metric_client.gauge_set("lightllm_num_running_reqs", len(self.running_batch.reqs)) self.metric_client.gauge_set("lightllm_queue_size", self.req_queue.get_wait_req_num()) self.metric_client.gauge_set( "lightllm_batch_current_max_tokens", @@ -260,6 +257,7 
@@ async def loop_for_fwd( self.req_queue.update_token_load(self.running_batch, force_update=True) if counter_count % 300 == 0: self.metric_client.gauge_set("lightllm_batch_current_size", 0.0) + self.metric_client.gauge_set("lightllm_num_running_reqs", 0.0) self.metric_client.gauge_set("lightllm_batch_pause_size", 0.0) self.metric_client.gauge_set("lightllm_queue_size", 0.0) self.metric_client.gauge_set("lightllm_batch_current_max_tokens", 0.0) @@ -269,8 +267,6 @@ async def loop_for_fwd( estimated_peak_token_count = self.shared_token_load.get_estimated_peak_token_count(dp_i) logger.debug(f"dp_i {dp_i} estimated_peak_token_count: {estimated_peak_token_count} \n") - self.status_reporter.maybe_report(self.running_batch) - await asyncio.sleep(self._get_schedule_time_interval()) async def _step(self): @@ -299,7 +295,6 @@ async def _step(self): async def _add_batch(self, batch: Batch): # 添加新请求 - self.status_reporter.count_prompt_tokens(batch.input_tokens()) reqs = [r.to_router_rpc_obj() for r in batch.reqs] while not self.shm_reqs_io_buffer.is_empty(): await asyncio.sleep(0.001) @@ -333,13 +328,6 @@ def _add_new_batch_to_running_batch(self, new_batch: Batch): def _filter_reqs_from_running_batch(self): if self.running_batch is not None: - # 在过滤前,捕获已完成请求的统计信息,用于计算 cache 命中率 - for req in self.running_batch.reqs: - if req.shm_infer_released: - self.status_reporter.on_request_completed( - input_len=req.input_len, - gpu_cache_len=req.prompt_cache_len, - ) self.running_batch.filter_out_finished_req(self.shm_req_manager, self.router_statics) if self.running_batch.is_clear(): self.running_batch = None diff --git a/lightllm/server/router/stats.py b/lightllm/server/router/stats.py index ad7018341a..b715c5bcb3 100644 --- a/lightllm/server/router/stats.py +++ b/lightllm/server/router/stats.py @@ -1,71 +1,9 @@ -import time from lightllm.utils.log_utils import init_logger from lightllm.server.core.objs import StartArgs logger = init_logger(__name__) -class SystemStatusReporter: - """统计 
token 吞吐和 prefix cache 命中情况,并周期性上报到 prometheus 监控指标。""" - - def __init__(self, args: StartArgs, metric_client=None): - self.enabled = not args.disable_log_stats - self.interval = max(5, args.log_stats_interval) - if args.log_stats_interval < 5: - logger.warning(f"log_stats_interval={args.log_stats_interval}s is below minimum, using 5s") - self.metric_client = metric_client - - # 窗口期计数器(每个上报周期重置) - self.last_report_time = time.time() - self.prompt_tokens = 0 - self.output_tokens = 0 - - # 全局计数器(不重置,用于计算全局 cache 命中率) - self.global_input_total = 0 - self.global_gpu_cache_total = 0 - - def count_prompt_tokens(self, num_tokens: int): - if self.metric_client is not None: - self.metric_client.counter_inc_by("lightllm_prompt_tokens_total", num_tokens) - if self.enabled: - self.prompt_tokens += num_tokens - - def count_output_tokens(self, num_tokens: int): - if self.metric_client is not None: - self.metric_client.counter_inc_by("lightllm_generation_tokens_total", num_tokens) - if self.enabled: - self.output_tokens += num_tokens - - def on_request_completed(self, input_len: int, gpu_cache_len: int): - if self.enabled: - self.global_input_total += input_len - self.global_gpu_cache_total += gpu_cache_len - - def maybe_report(self, running_batch): - if not self.enabled: - return - now = time.time() - elapsed = now - self.last_report_time - if elapsed < self.interval: - return - - output_tps = self.output_tokens / elapsed - running = len(running_batch.reqs) if running_batch is not None else 0 - global_gpu_cache_hit_rate = ( - (self.global_gpu_cache_total / self.global_input_total) if self.global_input_total > 0 else 0.0 - ) - - if self.metric_client is not None: - self.metric_client.gauge_set("lightllm_cache_hit_rate", global_gpu_cache_hit_rate) - self.metric_client.gauge_set("lightllm_gen_throughput", output_tps) - self.metric_client.gauge_set("lightllm_num_running_reqs", running) - - # 重置窗口期计数器 - self.prompt_tokens = 0 - self.output_tokens = 0 - self.last_report_time = now - - 
class RouterStatics: def __init__(self, args: StartArgs): self.busy_token_used_ratio = args.router_token_ratio From f9f22d44bd230ea2f83e5bc4a888047744373d81 Mon Sep 17 00:00:00 2001 From: shihaobai <1798930569@qq.com> Date: Thu, 11 Jun 2026 11:42:33 +0000 Subject: [PATCH 5/5] fix tool_check --- lightllm/server/function_call_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/function_call_parser.py b/lightllm/server/function_call_parser.py index f204c154ed..dfcb2f8d9e 100644 --- a/lightllm/server/function_call_parser.py +++ b/lightllm/server/function_call_parser.py @@ -30,7 +30,7 @@ from .api_models import Tool logger = logging.getLogger(__name__) -ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "True").upper() in ["ON", "TRUE", "1"] +ENABLE_TOOL_NAME_CHECK = os.getenv("LIGHTLLM_ENABLE_TOOL_NAME_CHECK", "False").upper() in ["ON", "TRUE", "1"] TOOLS_TAG_LIST = [ "<|plugin|>",