/* global React, Button, Icon, LoadingSpinner, Pill, Select, EmptyState, Drawer, useApiResource, apiFetch, formatRelative, formatDuration, shortId, useToast, useAuth, canUseTenantAction, tenantActionDisabledReason, workflowTesterTargetLimit, targetsOverWorkflowLimit, WorkflowTargetPicker, useEscapeToClose, SaTlcOpChip, SaTlcStatusPill */ // // Experiments area // ---------------- // "Did the change improve the MCP?" An experiment runs the same workflow // set across the same harness × model paths against a baseline MCP server // and a variant, then compares pass rate, quality (avg evaluation score), // and efficiency (duration, tool calls) — and flags regressions. // // /experiments → experiments list table. // /experiments/:id → same list with the comparison drawer open // (deep-linkable, refresh-safe). // // Fan-out reuses the benchmark batch machinery server-side; each run row // links to /runs/:id like benchmark cells do. /* React hooks are aliased with an X suffix; presumably this avoids colliding with hook names declared by other scripts that share this global scope (TODO confirm). */ const { useEffect: useEffectX, useMemo: useMemoX, useState: useStateX, } = React; /* expFmtPct(rate): formats a fractional rate as a percentage with one decimal place (rate * 100, then toFixed(1), then '%'). Returns '—' when rate is null or undefined, or when Number(rate) is not finite. */ function expFmtPct(rate) { if (rate == null || !Number.isFinite(Number(rate))) return '—'; return `${(Number(rate) * 100).toFixed(1)}%`; } /* expFmtScore(score): formats a numeric score with one decimal place via toFixed(1). Returns '—' when score is null or undefined, or when Number(score) is not finite. */ function expFmtScore(score) { if (score == null || !Number.isFinite(Number(score))) return '—'; return Number(score).toFixed(1); } // Signed delta chip. `inverse` flips the good direction for // lower-is-better metrics (duration, tool calls). /* Props: value (raw delta; null, undefined, or non-finite takes the early-return branch), suffix (default 'pt'), scale (multiplier applied to value, default 100), digits (decimal places, default 1), inverse (default false). The scaled value is rounded to digits before its sign is tested, so a delta that rounds to zero is classed 'flat'; otherwise the class is 'up' when the direction is good and 'down' when it is not. A '+' prefix is emitted only for positive rounded values. */ function ExperimentDelta({ value, suffix = 'pt', scale = 100, digits = 1, inverse = false }) { if (value == null || !Number.isFinite(Number(value))) { return —; } const scaled = Number(value) * scale; const rounded = Number(scaled.toFixed(digits)); const good = inverse ? rounded < 0 : rounded > 0; const cls = rounded === 0 ? 'flat' : good ? 'up' : 'down'; return ( {rounded > 0 ? 
'+' : ''}{rounded.toFixed(digits)}{suffix} ); } /* Tone name for each experiment status. NOTE(review): the lookup site is not visible in this view; presumably the status pill markup reads it (TODO confirm). */ const EXPERIMENT_STATUS_TONES = { queued: 'neutral', running: 'info', completed: 'ok', failed: 'bad', canceled: 'neutral', }; /* ExperimentStatusPill({ experiment }): a recorded decision of 'accepted' or 'rejected' takes precedence over run status. Without a decision, experiment.effective_status is used when truthy, falling back to experiment.status. */ function ExperimentStatusPill({ experiment }) { if (experiment.decision === 'accepted') return Accepted; if (experiment.decision === 'rejected') return Rejected; const status = experiment.effective_status || experiment.status; return {status}; } // --------------------------------------------------------------------------- // Comparison drawer // --------------------------------------------------------------------------- /* ExperimentArmCells({ row }): formats the baseline and variant pass rates of one comparison row with expFmtPct. Either arm may be missing (optional chaining), in which case expFmtPct yields '—'. */ function ExperimentArmCells({ row }) { return ( <> {expFmtPct(row.baseline?.passRate)} {expFmtPct(row.variant?.passRate)} ); } /* ExperimentDrawer({ experimentId, navigate, onClose, onChanged }): comparison drawer for a single experiment. Reads the experiment and its comparison from the experiments detail resource for experimentId; both are null until data arrives. onChanged is optional and is invoked after a decision has been saved and the detail resource reloaded. NOTE(review): the uses of navigate, onClose, and canDecide are not visible in this view. */ function ExperimentDrawer({ experimentId, navigate, onClose, onChanged }) { const toast = useToast(); const auth = useAuth(); const canDecide = canUseTenantAction(auth, 'editor'); const detail = useApiResource(`/api/experiments/${experimentId}`); const [deciding, setDeciding] = useStateX(false); const experiment = detail.data?.experiment || null; const comparison = detail.data?.comparison || null; const pendingRuns = comparison?.pendingRuns || 0; // Poll while runs are still in flight so the comparison fills in live. /* Reloads every 15000 ms; the effect's cleanup clears the timer, and the effect re-runs when pendingRuns or detail.reload changes. No timer is created while pendingRuns is 0. */ useEffectX(() => { if (!pendingRuns) return undefined; const timer = window.setInterval(() => detail.reload(), 15000); return () => window.clearInterval(timer); }, [pendingRuns, detail.reload]); /* decide(decision): PATCHes { decision } to the experiment, shows a toast, reloads the detail resource, then notifies the parent through onChanged. Calls made while a save is already in flight are ignored. */ async function decide(decision) { if (deciding) return; setDeciding(true); try { await apiFetch(`/api/experiments/${experimentId}`, { method: 'PATCH', body: JSON.stringify({ decision }), }); toast.show({ tone: decision === 'accepted' ? 'good' : 'info', title: decision === 'accepted' ? 
'Variant accepted' : 'Variant rejected', }); await detail.reload(); onChanged?.(); } catch (error) { toast.show({ tone: 'bad', title: 'Could not save decision', description: error.message }); } finally { setDeciding(false); } } /* Convenience views for rendering. overall is undefined until a comparison is loaded; regressions falls back to an empty list. NOTE(review): the regressions table further down formats a missing avgDurationMs as a zero duration (via a 0 fallback), whereas the overall avg-duration row shows '—' for missing values; confirm whether that difference is intended. */ const overall = comparison?.overall; const regressions = comparison?.regressions || []; return ( {shortId(experiment.id)} {experiment.created_at ? ` · ${formatRelative(experiment.created_at)}` : ''} ) : null} footer={experiment && !experiment.decision ? (

) : null}> {detail.loading && !detail.data ? (

Loading experiment...

) : detail.error ? ( ) : experiment ? ( <> {experiment.hypothesis && (

Hypothesis

{experiment.hypothesis}

)} {experiment.tool_list_change_id && experiment.tool_list_change_title && (

Tool list change

{/* The structured change this experiment validates (174/#1110). Diff rows reuse the Tool registry's vocabulary so the change reads identically on both surfaces. */}

{experiment.tool_list_change_title}

{(experiment.tool_list_change_edits || []).map((edit, index) => (

{edit.tool_name} {(edit.note || edit.after?.description) && ( — {edit.note || edit.after.description} )}

))}

)} {pendingRuns > 0 && (

{pendingRuns} run{pendingRuns === 1 ? '' : 's'} still in flight — results update live.

)}

Pass rate — baseline vs variant

Baseline · {experiment.baseline_mcp_server_name || 'baseline'}

{expFmtPct(overall?.baseline?.passRate)}

{overall?.baseline?.runs || 0} scored runs

Variant · {experiment.variant_mcp_server_name || 'variant'}

{expFmtPct(overall?.variant?.passRate)}

{overall?.variant?.runs || 0} scored runs

Quality & efficiency signals

Avg score (0–5)

{expFmtScore(overall?.baseline?.avgScore)} → {expFmtScore(overall?.variant?.avgScore)} {' '}

Avg duration

{overall?.baseline?.avgDurationMs != null ? formatDuration(overall.baseline.avgDurationMs) : '—'} {' → '} {overall?.variant?.avgDurationMs != null ? formatDuration(overall.variant.avgDurationMs) : '—'}

Avg tool calls / run

{expFmtScore(overall?.baseline?.avgToolCalls)} → {expFmtScore(overall?.variant?.avgToolCalls)} {' '}

Results by workflow

{(comparison?.byWorkflow || []).map((row) => ( ))}

Workflow	Baseline	Variant	Δ
{row.workflowName \|\| shortId(row.workflowId)}

Results by harness × model

{(comparison?.byPath || []).map((row) => ( ))}

Path	Baseline	Variant	Δ
{row.testerHarness \|\| 'unknown'} · {row.testerModelName \|\| row.testerModelId \|\| 'unknown'}

Regressions

{regressions.length === 0 ? (

No regressions detected across {(comparison?.byPath || []).length} path{(comparison?.byPath || []).length === 1 ? '' : 's'} and {(comparison?.byWorkflow || []).length} workflow{(comparison?.byWorkflow || []).length === 1 ? '' : 's'}.

) : ( {regressions.map((reg, index) => ( ))}

Cell	Kind	Baseline	Variant
{reg.workflowName \|\| shortId(reg.workflowId)} {reg.testerHarness \|\| 'unknown'} · {reg.testerModelName \|\| reg.testerModelId \|\| 'unknown'}	{reg.kind === 'pass_rate' ? 'Pass rate' : reg.kind === 'quality' ? 'Quality' : 'Efficiency'}	{reg.kind === 'efficiency' ? formatDuration(reg.baseline?.avgDurationMs \|\| 0) : reg.kind === 'quality' ? expFmtScore(reg.baseline?.avgScore) : expFmtPct(reg.baseline?.passRate)}	{reg.kind === 'efficiency' ? formatDuration(reg.variant?.avgDurationMs \|\| 0) : reg.kind === 'quality' ? expFmtScore(reg.variant?.avgScore) : expFmtPct(reg.variant?.passRate)}

)}

) : null} ); } // --------------------------------------------------------------------------- // New experiment modal // --------------------------------------------------------------------------- function NewExperimentDialog({ onClose, onCreated, initialToolListChangeId = null }) { const toast = useToast(); const auth = useAuth(); const workflows = useApiResource('/api/workflows'); const sources = useApiResource('/api/mcp-servers'); const catalog = useApiResource('/api/catalog/models'); // Tool-list changes live in the Session Analytics surface (mcp_analytics // flag); the endpoint 403s without it, so don't even fetch. The picker is // purely optional — an experiment without a TLC is a plain A/B of two // servers. const mcpAnalyticsEnabled = auth.me?.featureFlags?.mcpAnalytics === true; const toolListChanges = useApiResource(mcpAnalyticsEnabled ? '/api/mcp-analytics/v2/tool-list-changes' : null); const [name, setName] = useStateX(''); const [hypothesis, setHypothesis] = useStateX(''); const [workflowIds, setWorkflowIds] = useStateX([]); const [baselineId, setBaselineId] = useStateX(''); const [variantId, setVariantId] = useStateX(''); const [toolListChangeId, setToolListChangeId] = useStateX(''); const [testerTargets, setTesterTargets] = useStateX([]); const [saving, setSaving] = useStateX(false); const workflowRows = useMemoX( () => (workflows.data?.rows || []).filter((row) => row.is_active !== false), [workflows.data], ); const sourceRows = useMemoX( () => (sources.data?.rows || []).filter((row) => row.is_active !== false), [sources.data], ); const models = catalog.data?.models || []; const targetLimit = workflowTesterTargetLimit(auth); // Only open changes are attachable (mirrors the server-side guard). 
const attachableChanges = useMemoX( () => (toolListChanges.data?.changes || []) .filter((change) => ['draft', 'ready', 'in_experiment'].includes(change.status)), [toolListChanges.data], ); function pickToolListChange(id) { setToolListChangeId(id); const change = attachableChanges.find((c) => c.id === id); if (!change) return; // A TLC names the change and the server it edits — prefill the empty // fields so picking one is a one-click setup, never an overwrite. if (!name.trim()) setName(change.title); if (!baselineId && sourceRows.some((row) => row.id === change.mcp_server_id)) { setBaselineId(change.mcp_server_id); } } // "Launch experiment" deep link from a tool-list change card: apply the // same pick-and-prefill once the changes and servers have loaded. One-shot // — after the user touches the picker themselves, never re-apply. const [initialTlcApplied, setInitialTlcApplied] = useStateX(false); useEffectX(() => { if (initialTlcApplied || !initialToolListChangeId) return; if (toolListChanges.loading || sources.loading) return; setInitialTlcApplied(true); if (attachableChanges.some((c) => c.id === initialToolListChangeId)) { pickToolListChange(initialToolListChangeId); } }, [initialTlcApplied, initialToolListChangeId, toolListChanges.loading, sources.loading, attachableChanges]); function toggleWorkflow(id) { setWorkflowIds((current) => ( current.includes(id) ? 
current.filter((x) => x !== id) : [...current, id] )); } const totalRuns = workflowIds.length * testerTargets.length * 2; const formInvalid = !name.trim() || workflowIds.length === 0 || testerTargets.length === 0 || !baselineId || !variantId || baselineId === variantId || targetsOverWorkflowLimit(testerTargets, targetLimit); async function submit(event) { event.preventDefault(); if (formInvalid || saving) return; setSaving(true); try { const result = await apiFetch('/api/experiments', { method: 'POST', body: JSON.stringify({ name: name.trim(), hypothesis: hypothesis.trim() || null, workflowIds, testerTargets, baselineMcpServerId: baselineId, variantMcpServerId: variantId, toolListChangeId: toolListChangeId || null, }), }); toast.show({ tone: 'good', title: 'Experiment dispatched', description: `${result.summary?.queued ?? 0} runs queued across baseline and variant.`, }); onCreated(result.experiment); } catch (error) { toast.show({ tone: 'bad', title: 'Could not start experiment', description: error.message }); setSaving(false); } } useEscapeToClose({ disabled: saving, onClose }); const sourceOptions = sourceRows.map((row) => ({ value: row.id, label: `${row.name}${row.environment ? ` · ${row.environment}` : ''}`, })); return (

{ if (e.target === e.currentTarget && !saving) onClose(); }}>