From 1f4618877fd043e512e60c23eb183f1cf0f7546b Mon Sep 17 00:00:00 2001 From: ekeith <55766816+evanmkeith@users.noreply.github.com> Date: Wed, 27 May 2026 14:02:58 -0700 Subject: [PATCH] Fix Python remote eval parameter serialization in dev mode ## Summary Fixes Python `bt eval --dev` `/list` responses for evals that define `parameters={...}`. The Python eval runner was emitting parameters as a raw JSON Schema object via `parameters_to_json_schema(...)`. The Braintrust UI expects remote eval parameters to use the serialized parameter container shape, such as `braintrust.staticParameters` or `braintrust.parameters`. As a result, `/list` could return `200 OK` while the UI still failed to parse the evaluator manifest and showed the generic connection/listing error. This updates the Python runner to serialize parameters with the same remote eval parameter container shape used by the Python SDK devserver/push flow. ## Test Plan - Add a Python list-mode fixture with a Pydantic parameter model. - Assert `BT_EVAL_DEV_MODE=list` emits `parameters.type = "braintrust.staticParameters"`. - Run: - `python3 -m py_compile scripts/eval-runner.py tests/evals/py/remote_list_params/eval_remote_list_params.py` - `cargo fmt --check` - `cargo test eval_python_runner_list_mode_serializes_remote_parameter_container --test eval_fixtures -- --nocapture` --- scripts/eval-runner.py | 41 ++++++- tests/eval_fixtures.rs | 116 +++++++++++++++--- .../eval_remote_list_params.py | 30 +++++ .../evals/py/remote_list_params/fixture.json | 4 + 4 files changed, 176 insertions(+), 15 deletions(-) create mode 100644 tests/evals/py/remote_list_params/eval_remote_list_params.py create mode 100644 tests/evals/py/remote_list_params/fixture.json diff --git a/scripts/eval-runner.py b/scripts/eval-runner.py index 6688f912..d7448e6b 100755 --- a/scripts/eval-runner.py +++ b/scripts/eval-runner.py @@ -27,6 +27,10 @@ ) from braintrust.logger import Dataset, init as init_logger_experiment, parent_context, _internal_get_global_state from braintrust.parameters import parameters_to_json_schema, validate_parameters + try: + from braintrust.parameters import serialize_remote_eval_parameters_container + except ImportError: + serialize_remote_eval_parameters_container = None from braintrust.util import eprint from braintrust.span_identifier_v4 import parse_parent except Exception as exc: # pragma: no cover - runtime guard @@ -567,12 +571,47 @@ def build_eval_definitions(evaluator_instances: list[EvaluatorInstance]) -> dict evaluator = evaluator_instance.evaluator scores = [{"name": getattr(score, "__name__", f"scorer_{i}")} for i, score in enumerate(evaluator.scores)] definitions[evaluator.eval_name] = { - "parameters": parameters_to_json_schema(evaluator.parameters) if evaluator.parameters else {}, + "parameters": serialize_eval_parameters_container(evaluator.parameters) + if evaluator.parameters + else {}, "scores": scores, } return definitions +def serialize_eval_parameters_container(parameters: Any) -> dict[str, Any]: + if serialize_remote_eval_parameters_container is not None: + return serialize_remote_eval_parameters_container(parameters) + + schema = parameters_to_json_schema(parameters) + static_parameters: dict[str, Any] = {} + properties = schema.get("properties", {}) + if isinstance(properties, dict): + for name, property_schema in properties.items(): + if not isinstance(property_schema, dict): + continue + + parameter_type = property_schema.get("x-bt-type") + if parameter_type == "prompt": + parameter: dict[str, Any] = {"type": "prompt"} + elif parameter_type == "model": + parameter = {"type": "model"} + else: + parameter = {"type": "data", "schema": property_schema} + + if "default" in property_schema: + parameter["default"] = property_schema["default"] + if isinstance(property_schema.get("description"), str): + parameter["description"] = property_schema["description"] + static_parameters[name] = parameter + + return { + "type": "braintrust.staticParameters", + "schema": static_parameters, + "source": None, + } + + def collect_files(input_path: str) -> list[str]: if os.path.isdir(input_path): matches: list[str] = [] diff --git a/tests/eval_fixtures.rs b/tests/eval_fixtures.rs index fc1bf9a7..665b46cd 100644 --- a/tests/eval_fixtures.rs +++ b/tests/eval_fixtures.rs @@ -447,6 +447,84 @@ fn eval_runner_list_mode_serializes_parameter_defaults() { ); } +#[test] +fn eval_python_runner_list_mode_serializes_remote_parameter_container() { + let _guard = test_lock(); + let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let fixtures_root = root.join("tests").join("evals"); + let fixture_dir = fixtures_root.join("py").join("remote_list_params"); + let python = match ensure_python_env(&fixtures_root.join("py")) { + Some(python) => python, + None => { + if required_runtimes().contains("python") { + panic!("python runtime unavailable for list-mode parameter test"); + } + eprintln!( + "Skipping eval_python_runner_list_mode_serializes_remote_parameter_container (python runtime unavailable)." + ); + return; + } + }; + + let output = Command::new(&python) + .arg(root.join("scripts").join("eval-runner.py")) + .arg("eval_remote_list_params.py") + .current_dir(&fixture_dir) + .env("BT_EVAL_DEV_MODE", "list") + .env("BT_EVAL_NO_SEND_LOGS", "1") + .output() + .expect("run python eval-runner in list mode"); + + if !output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + panic!( + "python list-mode runner failed with status {}\nstdout:\n{}\nstderr:\n{}", + output.status, stdout, stderr + ); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let manifest_line = stdout + .lines() + .rev() + .find(|line| !line.trim().is_empty()) + .expect("runner should emit manifest JSON"); + let manifest: Value = serde_json::from_str(manifest_line).expect("parse manifest json"); + + let evaluator = manifest + .get("test-cli-python-remote-list-params") + .and_then(Value::as_object) + .expect("manifest should include evaluator"); + let parameters = evaluator + .get("parameters") + .and_then(Value::as_object) + .expect("evaluator should include parameter container"); + assert_eq!( + parameters.get("type").and_then(Value::as_str), + Some("braintrust.staticParameters") + ); + assert!( + parameters.get("source").is_some_and(Value::is_null), + "static parameter container should include null source" + ); + + let thresholds = parameters + .get("schema") + .and_then(Value::as_object) + .and_then(|schema| schema.get("thresholds")) + .and_then(Value::as_object) + .expect("thresholds static parameter should exist"); + assert_eq!(thresholds.get("type").and_then(Value::as_str), Some("data")); + assert!( + thresholds + .get("schema") + .and_then(Value::as_object) + .is_some(), + "thresholds data parameter should include a JSON schema" + ); +} + #[test] fn eval_matrix_param_single_runs_all_combinations() { let _guard = test_lock(); @@ -996,9 +1074,13 @@ fn enforce_required_runtimes(fixtures_root: &Path) { let python = ensure_python_env(&fixtures_root.join("py")) .expect("python runtime is required but uv/python is unavailable"); assert!( - python_can_import_braintrust(python.to_string_lossy().as_ref()), + python_can_import_module(python.to_string_lossy().as_ref(), "braintrust"), "python runtime is required but braintrust package is unavailable" ); + assert!( + python_can_import_module(python.to_string_lossy().as_ref(), "pydantic"), + "python runtime is required but pydantic package is unavailable" + ); } } @@ -1095,17 +1177,19 @@ fn ensure_python_env(fixtures_root: &Path) -> Option { } } - if !python_can_import_braintrust(python.to_string_lossy().as_ref()) { - let status = Command::new("uv") - .args([ - "pip", - "install", - "--python", - python.to_string_lossy().as_ref(), - "braintrust", - ]) - .status() - .ok()?; + let mut missing_packages = Vec::new(); + if !python_can_import_module(python.to_string_lossy().as_ref(), "braintrust") { + missing_packages.push("braintrust"); + } + if !python_can_import_module(python.to_string_lossy().as_ref(), "pydantic") { + missing_packages.push("pydantic"); + } + + if !missing_packages.is_empty() { + let python_arg = python.to_string_lossy().to_string(); + let mut args = vec!["pip", "install", "--python", python_arg.as_str()]; + args.extend(missing_packages); + let status = Command::new("uv").args(args).status().ok()?; if !status.success() { return None; } @@ -1122,9 +1206,13 @@ fn venv_python_path(venv: &Path) -> PathBuf { } } -fn python_can_import_braintrust(python: &str) -> bool { +fn python_can_import_module(python: &str, module: &str) -> bool { Command::new(python) - .args(["-c", "import braintrust"]) + .args([ + "-c", + "import importlib, sys; importlib.import_module(sys.argv[1])", + module, + ]) .stdout(Stdio::null()) .stderr(Stdio::null()) .status() diff --git a/tests/evals/py/remote_list_params/eval_remote_list_params.py b/tests/evals/py/remote_list_params/eval_remote_list_params.py new file mode 100644 index 00000000..d6a2aaa0 --- /dev/null +++ b/tests/evals/py/remote_list_params/eval_remote_list_params.py @@ -0,0 +1,30 @@ +from braintrust import Eval +from pydantic import BaseModel, Field + + +class ThresholdParams(BaseModel): + cluster_min_impression_share: float = Field( + default=0.05, + description="Fail clusters below this impression share.", + ) + unassigned_max_impression_share: float = Field( + default=0.15, + description="Fail rows above this unassigned impression share.", + ) + + +def data(): + return [{"input": "test", "expected": "test"}] + + +def task(input, hooks): + return input + + +Eval( + "test-cli-python-remote-list-params", + parameters={"thresholds": ThresholdParams}, + data=data, + task=task, + scores=[], +) diff --git a/tests/evals/py/remote_list_params/fixture.json b/tests/evals/py/remote_list_params/fixture.json new file mode 100644 index 00000000..f77ef9c6 --- /dev/null +++ b/tests/evals/py/remote_list_params/fixture.json @@ -0,0 +1,4 @@ +{ + "runtime": "python", + "files": ["eval_remote_list_params.py"] +}