From 1f4618877fd043e512e60c23eb183f1cf0f7546b Mon Sep 17 00:00:00 2001
From: ekeith <55766816+evanmkeith@users.noreply.github.com>
Date: Wed, 27 May 2026 14:02:58 -0700
Subject: [PATCH] Fix Python remote eval parameter serialization in dev mode

## Summary

  Fixes Python `bt eval --dev` `/list` responses for evals that define `parameters={...}`.

  The Python eval runner was emitting parameters as a raw JSON Schema object via `parameters_to_json_schema(...)`. The Braintrust UI expects remote eval parameters to use the
  serialized parameter container shape, such as `braintrust.staticParameters` or `braintrust.parameters`. As a result, `/list` could return `200 OK` while the UI still failed to
  parse the evaluator manifest and showed the generic connection/listing error.

  This updates the Python runner to serialize parameters with the same remote eval parameter container shape used by the Python SDK devserver/push flow.

  ## Test Plan

  - Add a Python list-mode fixture with a Pydantic parameter model.
  - Assert `BT_EVAL_DEV_MODE=list` emits `parameters.type = "braintrust.staticParameters"`.
  - Run:
    - `python3 -m py_compile scripts/eval-runner.py tests/evals/py/remote_list_params/eval_remote_list_params.py`
    - `cargo fmt --check`
    - `cargo test eval_python_runner_list_mode_serializes_remote_parameter_container --test eval_fixtures -- --nocapture`
---
 scripts/eval-runner.py                        |  41 ++++++-
 tests/eval_fixtures.rs                        | 116 +++++++++++++++---
 .../eval_remote_list_params.py                |  30 +++++
 .../evals/py/remote_list_params/fixture.json  |   4 +
 4 files changed, 176 insertions(+), 15 deletions(-)
 create mode 100644 tests/evals/py/remote_list_params/eval_remote_list_params.py
 create mode 100644 tests/evals/py/remote_list_params/fixture.json

diff --git a/scripts/eval-runner.py b/scripts/eval-runner.py
index 6688f912..d7448e6b 100755
--- a/scripts/eval-runner.py
+++ b/scripts/eval-runner.py
@@ -27,6 +27,10 @@
     )
     from braintrust.logger import Dataset, init as init_logger_experiment, parent_context, _internal_get_global_state
     from braintrust.parameters import parameters_to_json_schema, validate_parameters
+    try:
+        from braintrust.parameters import serialize_remote_eval_parameters_container
+    except ImportError:
+        serialize_remote_eval_parameters_container = None
     from braintrust.util import eprint
     from braintrust.span_identifier_v4 import parse_parent
 except Exception as exc:  # pragma: no cover - runtime guard
@@ -567,12 +571,47 @@ def build_eval_definitions(evaluator_instances: list[EvaluatorInstance]) -> dict
         evaluator = evaluator_instance.evaluator
         scores = [{"name": getattr(score, "__name__", f"scorer_{i}")} for i, score in enumerate(evaluator.scores)]
         definitions[evaluator.eval_name] = {
-            "parameters": parameters_to_json_schema(evaluator.parameters) if evaluator.parameters else {},
+            "parameters": serialize_eval_parameters_container(evaluator.parameters)
+            if evaluator.parameters
+            else {},
             "scores": scores,
         }
     return definitions
 
 
+def serialize_eval_parameters_container(parameters: Any) -> dict[str, Any]:
+    if serialize_remote_eval_parameters_container is not None:
+        return serialize_remote_eval_parameters_container(parameters)
+
+    schema = parameters_to_json_schema(parameters)
+    static_parameters: dict[str, Any] = {}
+    properties = schema.get("properties", {})
+    if isinstance(properties, dict):
+        for name, property_schema in properties.items():
+            if not isinstance(property_schema, dict):
+                continue
+
+            parameter_type = property_schema.get("x-bt-type")
+            if parameter_type == "prompt":
+                parameter: dict[str, Any] = {"type": "prompt"}
+            elif parameter_type == "model":
+                parameter = {"type": "model"}
+            else:
+                parameter = {"type": "data", "schema": property_schema}
+
+            if "default" in property_schema:
+                parameter["default"] = property_schema["default"]
+            if isinstance(property_schema.get("description"), str):
+                parameter["description"] = property_schema["description"]
+            static_parameters[name] = parameter
+
+    return {
+        "type": "braintrust.staticParameters",
+        "schema": static_parameters,
+        "source": None,
+    }
+
+
 def collect_files(input_path: str) -> list[str]:
     if os.path.isdir(input_path):
         matches: list[str] = []
diff --git a/tests/eval_fixtures.rs b/tests/eval_fixtures.rs
index fc1bf9a7..665b46cd 100644
--- a/tests/eval_fixtures.rs
+++ b/tests/eval_fixtures.rs
@@ -447,6 +447,84 @@ fn eval_runner_list_mode_serializes_parameter_defaults() {
     );
 }
 
+#[test]
+fn eval_python_runner_list_mode_serializes_remote_parameter_container() {
+    let _guard = test_lock();
+    let root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let fixtures_root = root.join("tests").join("evals");
+    let fixture_dir = fixtures_root.join("py").join("remote_list_params");
+    let python = match ensure_python_env(&fixtures_root.join("py")) {
+        Some(python) => python,
+        None => {
+            if required_runtimes().contains("python") {
+                panic!("python runtime unavailable for list-mode parameter test");
+            }
+            eprintln!(
+                "Skipping eval_python_runner_list_mode_serializes_remote_parameter_container (python runtime unavailable)."
+            );
+            return;
+        }
+    };
+
+    let output = Command::new(&python)
+        .arg(root.join("scripts").join("eval-runner.py"))
+        .arg("eval_remote_list_params.py")
+        .current_dir(&fixture_dir)
+        .env("BT_EVAL_DEV_MODE", "list")
+        .env("BT_EVAL_NO_SEND_LOGS", "1")
+        .output()
+        .expect("run python eval-runner in list mode");
+
+    if !output.status.success() {
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        panic!(
+            "python list-mode runner failed with status {}\nstdout:\n{}\nstderr:\n{}",
+            output.status, stdout, stderr
+        );
+    }
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    let manifest_line = stdout
+        .lines()
+        .rev()
+        .find(|line| !line.trim().is_empty())
+        .expect("runner should emit manifest JSON");
+    let manifest: Value = serde_json::from_str(manifest_line).expect("parse manifest json");
+
+    let evaluator = manifest
+        .get("test-cli-python-remote-list-params")
+        .and_then(Value::as_object)
+        .expect("manifest should include evaluator");
+    let parameters = evaluator
+        .get("parameters")
+        .and_then(Value::as_object)
+        .expect("evaluator should include parameter container");
+    assert_eq!(
+        parameters.get("type").and_then(Value::as_str),
+        Some("braintrust.staticParameters")
+    );
+    assert!(
+        parameters.get("source").is_some_and(Value::is_null),
+        "static parameter container should include null source"
+    );
+
+    let thresholds = parameters
+        .get("schema")
+        .and_then(Value::as_object)
+        .and_then(|schema| schema.get("thresholds"))
+        .and_then(Value::as_object)
+        .expect("thresholds static parameter should exist");
+    assert_eq!(thresholds.get("type").and_then(Value::as_str), Some("data"));
+    assert!(
+        thresholds
+            .get("schema")
+            .and_then(Value::as_object)
+            .is_some(),
+        "thresholds data parameter should include a JSON schema"
+    );
+}
+
 #[test]
 fn eval_matrix_param_single_runs_all_combinations() {
     let _guard = test_lock();
@@ -996,9 +1074,13 @@ fn enforce_required_runtimes(fixtures_root: &Path) {
         let python = ensure_python_env(&fixtures_root.join("py"))
             .expect("python runtime is required but uv/python is unavailable");
         assert!(
-            python_can_import_braintrust(python.to_string_lossy().as_ref()),
+            python_can_import_module(python.to_string_lossy().as_ref(), "braintrust"),
             "python runtime is required but braintrust package is unavailable"
         );
+        assert!(
+            python_can_import_module(python.to_string_lossy().as_ref(), "pydantic"),
+            "python runtime is required but pydantic package is unavailable"
+        );
     }
 }
 
@@ -1095,17 +1177,19 @@ fn ensure_python_env(fixtures_root: &Path) -> Option<PathBuf> {
         }
     }
 
-    if !python_can_import_braintrust(python.to_string_lossy().as_ref()) {
-        let status = Command::new("uv")
-            .args([
-                "pip",
-                "install",
-                "--python",
-                python.to_string_lossy().as_ref(),
-                "braintrust",
-            ])
-            .status()
-            .ok()?;
+    let mut missing_packages = Vec::new();
+    if !python_can_import_module(python.to_string_lossy().as_ref(), "braintrust") {
+        missing_packages.push("braintrust");
+    }
+    if !python_can_import_module(python.to_string_lossy().as_ref(), "pydantic") {
+        missing_packages.push("pydantic");
+    }
+
+    if !missing_packages.is_empty() {
+        let python_arg = python.to_string_lossy().to_string();
+        let mut args = vec!["pip", "install", "--python", python_arg.as_str()];
+        args.extend(missing_packages);
+        let status = Command::new("uv").args(args).status().ok()?;
         if !status.success() {
             return None;
         }
@@ -1122,9 +1206,13 @@ fn venv_python_path(venv: &Path) -> PathBuf {
     }
 }
 
-fn python_can_import_braintrust(python: &str) -> bool {
+fn python_can_import_module(python: &str, module: &str) -> bool {
     Command::new(python)
-        .args(["-c", "import braintrust"])
+        .args([
+            "-c",
+            "import importlib, sys; importlib.import_module(sys.argv[1])",
+            module,
+        ])
         .stdout(Stdio::null())
         .stderr(Stdio::null())
         .status()
diff --git a/tests/evals/py/remote_list_params/eval_remote_list_params.py b/tests/evals/py/remote_list_params/eval_remote_list_params.py
new file mode 100644
index 00000000..d6a2aaa0
--- /dev/null
+++ b/tests/evals/py/remote_list_params/eval_remote_list_params.py
@@ -0,0 +1,30 @@
+from braintrust import Eval
+from pydantic import BaseModel, Field
+
+
+class ThresholdParams(BaseModel):
+    cluster_min_impression_share: float = Field(
+        default=0.05,
+        description="Fail clusters below this impression share.",
+    )
+    unassigned_max_impression_share: float = Field(
+        default=0.15,
+        description="Fail rows above this unassigned impression share.",
+    )
+
+
+def data():
+    return [{"input": "test", "expected": "test"}]
+
+
+def task(input, hooks):
+    return input
+
+
+Eval(
+    "test-cli-python-remote-list-params",
+    parameters={"thresholds": ThresholdParams},
+    data=data,
+    task=task,
+    scores=[],
+)
diff --git a/tests/evals/py/remote_list_params/fixture.json b/tests/evals/py/remote_list_params/fixture.json
new file mode 100644
index 00000000..f77ef9c6
--- /dev/null
+++ b/tests/evals/py/remote_list_params/fixture.json
@@ -0,0 +1,4 @@
+{
+  "runtime": "python",
+  "files": ["eval_remote_list_params.py"]
+}