Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion scripts/eval-runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
)
from braintrust.logger import Dataset, init as init_logger_experiment, parent_context, _internal_get_global_state
from braintrust.parameters import parameters_to_json_schema, validate_parameters
try:
from braintrust.parameters import serialize_remote_eval_parameters_container
except ImportError:
serialize_remote_eval_parameters_container = None
from braintrust.util import eprint
from braintrust.span_identifier_v4 import parse_parent
except Exception as exc: # pragma: no cover - runtime guard
Expand Down Expand Up @@ -567,12 +571,47 @@ def build_eval_definitions(evaluator_instances: list[EvaluatorInstance]) -> dict
evaluator = evaluator_instance.evaluator
scores = [{"name": getattr(score, "__name__", f"scorer_{i}")} for i, score in enumerate(evaluator.scores)]
definitions[evaluator.eval_name] = {
"parameters": parameters_to_json_schema(evaluator.parameters) if evaluator.parameters else {},
"parameters": serialize_eval_parameters_container(evaluator.parameters)
if evaluator.parameters
else {},
"scores": scores,
}
return definitions


def serialize_eval_parameters_container(parameters: Any) -> dict[str, Any]:
if serialize_remote_eval_parameters_container is not None:
return serialize_remote_eval_parameters_container(parameters)

schema = parameters_to_json_schema(parameters)
static_parameters: dict[str, Any] = {}
properties = schema.get("properties", {})
if isinstance(properties, dict):
for name, property_schema in properties.items():
if not isinstance(property_schema, dict):
continue

parameter_type = property_schema.get("x-bt-type")
if parameter_type == "prompt":
parameter: dict[str, Any] = {"type": "prompt"}
elif parameter_type == "model":
parameter = {"type": "model"}
else:
parameter = {"type": "data", "schema": property_schema}

if "default" in property_schema:
parameter["default"] = property_schema["default"]
if isinstance(property_schema.get("description"), str):
parameter["description"] = property_schema["description"]
static_parameters[name] = parameter

return {
"type": "braintrust.staticParameters",
"schema": static_parameters,
"source": None,
}


def collect_files(input_path: str) -> list[str]:
if os.path.isdir(input_path):
matches: list[str] = []
Expand Down
116 changes: 102 additions & 14 deletions tests/eval_fixtures.rs
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,84 @@ fn eval_runner_list_mode_serializes_parameter_defaults() {
);
}

#[test]
fn eval_python_runner_list_mode_serializes_remote_parameter_container() {
let _guard = test_lock();
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
let fixtures_root = root.join("tests").join("evals");
let fixture_dir = fixtures_root.join("py").join("remote_list_params");
let python = match ensure_python_env(&fixtures_root.join("py")) {
Some(python) => python,
None => {
if required_runtimes().contains("python") {
panic!("python runtime unavailable for list-mode parameter test");
}
eprintln!(
"Skipping eval_python_runner_list_mode_serializes_remote_parameter_container (python runtime unavailable)."
);
return;
}
};

let output = Command::new(&python)
.arg(root.join("scripts").join("eval-runner.py"))
.arg("eval_remote_list_params.py")
.current_dir(&fixture_dir)
.env("BT_EVAL_DEV_MODE", "list")
.env("BT_EVAL_NO_SEND_LOGS", "1")
.output()
.expect("run python eval-runner in list mode");

if !output.status.success() {
let stdout = String::from_utf8_lossy(&output.stdout);
let stderr = String::from_utf8_lossy(&output.stderr);
panic!(
"python list-mode runner failed with status {}\nstdout:\n{}\nstderr:\n{}",
output.status, stdout, stderr
);
}

let stdout = String::from_utf8_lossy(&output.stdout);
let manifest_line = stdout
.lines()
.rev()
.find(|line| !line.trim().is_empty())
.expect("runner should emit manifest JSON");
let manifest: Value = serde_json::from_str(manifest_line).expect("parse manifest json");

let evaluator = manifest
.get("test-cli-python-remote-list-params")
.and_then(Value::as_object)
.expect("manifest should include evaluator");
let parameters = evaluator
.get("parameters")
.and_then(Value::as_object)
.expect("evaluator should include parameter container");
assert_eq!(
parameters.get("type").and_then(Value::as_str),
Some("braintrust.staticParameters")
);
assert!(
parameters.get("source").is_some_and(Value::is_null),
"static parameter container should include null source"
);

let thresholds = parameters
.get("schema")
.and_then(Value::as_object)
.and_then(|schema| schema.get("thresholds"))
.and_then(Value::as_object)
.expect("thresholds static parameter should exist");
assert_eq!(thresholds.get("type").and_then(Value::as_str), Some("data"));
assert!(
thresholds
.get("schema")
.and_then(Value::as_object)
.is_some(),
"thresholds data parameter should include a JSON schema"
);
}

#[test]
fn eval_matrix_param_single_runs_all_combinations() {
let _guard = test_lock();
Expand Down Expand Up @@ -996,9 +1074,13 @@ fn enforce_required_runtimes(fixtures_root: &Path) {
let python = ensure_python_env(&fixtures_root.join("py"))
.expect("python runtime is required but uv/python is unavailable");
assert!(
python_can_import_braintrust(python.to_string_lossy().as_ref()),
python_can_import_module(python.to_string_lossy().as_ref(), "braintrust"),
"python runtime is required but braintrust package is unavailable"
);
assert!(
python_can_import_module(python.to_string_lossy().as_ref(), "pydantic"),
"python runtime is required but pydantic package is unavailable"
);
}
}

Expand Down Expand Up @@ -1095,17 +1177,19 @@ fn ensure_python_env(fixtures_root: &Path) -> Option<PathBuf> {
}
}

if !python_can_import_braintrust(python.to_string_lossy().as_ref()) {
let status = Command::new("uv")
.args([
"pip",
"install",
"--python",
python.to_string_lossy().as_ref(),
"braintrust",
])
.status()
.ok()?;
let mut missing_packages = Vec::new();
if !python_can_import_module(python.to_string_lossy().as_ref(), "braintrust") {
missing_packages.push("braintrust");
}
if !python_can_import_module(python.to_string_lossy().as_ref(), "pydantic") {
missing_packages.push("pydantic");
}

if !missing_packages.is_empty() {
let python_arg = python.to_string_lossy().to_string();
let mut args = vec!["pip", "install", "--python", python_arg.as_str()];
args.extend(missing_packages);
let status = Command::new("uv").args(args).status().ok()?;
if !status.success() {
return None;
}
Expand All @@ -1122,9 +1206,13 @@ fn venv_python_path(venv: &Path) -> PathBuf {
}
}

fn python_can_import_braintrust(python: &str) -> bool {
fn python_can_import_module(python: &str, module: &str) -> bool {
Command::new(python)
.args(["-c", "import braintrust"])
.args([
"-c",
"import importlib, sys; importlib.import_module(sys.argv[1])",
module,
])
.stdout(Stdio::null())
.stderr(Stdio::null())
.status()
Expand Down
30 changes: 30 additions & 0 deletions tests/evals/py/remote_list_params/eval_remote_list_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from braintrust import Eval
from pydantic import BaseModel, Field


class ThresholdParams(BaseModel):
cluster_min_impression_share: float = Field(
default=0.05,
description="Fail clusters below this impression share.",
)
unassigned_max_impression_share: float = Field(
default=0.15,
description="Fail rows above this unassigned impression share.",
)


def data():
return [{"input": "test", "expected": "test"}]


def task(input, hooks):
return input


Eval(
"test-cli-python-remote-list-params",
parameters={"thresholds": ThresholdParams},
data=data,
task=task,
scores=[],
)
4 changes: 4 additions & 0 deletions tests/evals/py/remote_list_params/fixture.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"runtime": "python",
"files": ["eval_remote_list_params.py"]
}
Loading