heaths · heaths · Jun 26, 2026 · Jun 26, 2026
diff --git a/.github/skills/evaluate-skills/SKILL.md b/.github/skills/evaluate-skills/SKILL.md
@@ -10,6 +10,9 @@ Use the [Vally reference docs](https://microsoft.github.io/vally/) for schema an
 This repo keeps evals under `evals/` and currently uses `.vally.yaml` to define the `pr`
 suite from evals tagged with `priority: p0`.
 
+Before running Vally commands in this repo, run `npm i` from the repository root so
+`npx vally` and `npx copilot` use the pinned local versions from `package.json`.
+
 ## Layout
 
 ```text
@@ -26,6 +29,9 @@ evals/
 - Each plugin keeps its eval spec in `evals/<plugin-name>/eval.yaml`.
 - Put sample inputs under `evals/<plugin-name>/fixtures/`.
 - Seed fixture files into the eval environment with `environment.files`.
+- Keep experiment helper scripts in the experiment directory and expose them through
+  an appropriately named root `package.json` script.
+- Write repo experiment helper scripts as ES modules.
 
 ## When to use this skill
 
@@ -55,9 +61,9 @@ Use this skill when the request involves:
 1. Keep `.vally.yaml` in sync if you add new suites or change how evals are grouped.
 2. Use eval `tags` for suite filters such as the current `priority: p0` pull request
    suite.
-3. Lint specs with `npx -y @microsoft/vally-cli@0.6.0 lint --eval-spec evals`.
-4. Run the pull request suite with
-   `COPILOT_GITHUB_TOKEN=... npx -y @microsoft/vally-cli@0.6.0 eval --suite pr --output-dir vally-results --junit`.
+3. Run `npm i` from the repository root before invoking repo CLI tools.
+4. Lint specs with `npx vally lint --eval-spec evals`.
+5. Run the pull request suite with `npx vally eval --suite pr --output-dir vally-results --junit`.
 
 ## Coverage rule
 

diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -12,19 +12,16 @@ jobs:
         with:
           node-version: 22
 
-      - name: Install vally
-        run: npm install -g @microsoft/vally-cli@0.6.0
-
-      - name: Install copilot CLI
-        run: npm install -g @github/copilot@1.0.65
+      - name: Install pinned CLI dependencies
+        run: npm ci
 
       - name: Lint eval specs
-        run: vally lint --eval-spec evals
+        run: npx vally lint --eval-spec evals
 
       - name: Run evals
         env:
           COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
-        run: vally eval --suite pr --output-dir vally-results --junit
+        run: npx vally eval --suite pr --output-dir vally-results --junit
 
       - name: Upload eval results
         if: always()

diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,13 @@
+# Node
+node_modules/
+.worktrees/
+
 # Python
 **/.venv/
 **/__pycache__/
 **/*.pyc
 **/*.pyo
 
 # Vally
+vally-experiment-results/
 vally-results/
diff --git a/AGENTS.md b/AGENTS.md
@@ -66,6 +66,8 @@ Add or update associated eval coverage under `evals/<plugin-name>/` when adding
 
 Use `.github/skills/evaluate-skills/SKILL.md` for eval work: use when creating, updating, or reviewing Vally evals for plugin skills. Covers `eval.yaml`, fixtures, graders, `expect_skills`, suites, tags, and eval coverage for new or changed skills.
 
+The repo pins Vally and Copilot CLI versions in the root `package.json`. For repo evals, workflows, and repo-local skill instructions, install them with `npm i` locally or `npm ci` in CI, then invoke them via `npx vally` and `npx copilot` rather than `npx -y` with inline versions.
+
 ## Pre-commit checklist
 
 1. Plugin files changed → bump plugin `version` in both `marketplace.json` and `plugin.json`.

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,114 @@
+# Contributing
+
+This repository keeps plugin evals under `evals/` and uses [Vally](https://microsoft.github.io/vally/) to lint specs, run the pull request eval suite, and compare experiment variants.
+
+Run commands from the repository root.
+
+## Prerequisites
+
+Install the pinned CLI dependencies:
+
+```sh
+npm i
+```
+
+Authenticate Copilot once before running evals or experiments:
+
+```sh
+npx copilot
+# then run /login
+```
+
+## Common development tasks
+
+### Lint eval specs
+
+Use this before sending a change for review:
+
+```sh
+npx vally lint --eval-spec evals
+```
+
+### Run the pull request eval suite
+
+The `pr` suite is defined in `.vally.yaml` and currently runs eval stimuli tagged with `priority: p0`.
+
+```sh
+npx vally eval --suite pr --output-dir vally-results --junit
+```
+
+### Run a single eval file
+
+When iterating on one plugin's coverage, point Vally at that spec directly:
+
+```sh
+npx vally eval --eval-spec evals/security/eval.yaml --output-dir vally-results
+```
+
+## Experiments
+
+Use experiments when you want to compare variants of the same eval. The current
+`pin-github-actions` benchmark compares the scripted skill in this repository
+with the pre-script, instruction-only baseline maintained on the
+`experiments-baseline` branch.
+
+### Experiment layout
+
+- Put experiments under a dedicated directory such as
+  `evals/security/pin-github-actions-usage/`.
+- Keep related helper scripts in that same experiment directory.
+- Add a concise root `package.json` script for each helper, and keep repo helper
+  scripts in ES module format.
+
+### Prepare the baseline worktree
+
+Create the local worktree once:
+
+```sh
+git worktree add .worktrees/experiments-baseline experiments-baseline
+```
+
+Refresh it later as needed:
+
+```sh
+git -C .worktrees/experiments-baseline pull --ff-only
+```
+
+### Preview the experiment plan
+
+```sh
+npx vally experiment run evals/security/pin-github-actions-usage/experiment.yaml --dry-run
+```
+
+### Run the pin-github-actions usage experiment
+
+```sh
+npx vally experiment run \
+  evals/security/pin-github-actions-usage/experiment.yaml \
+  --output-dir vally-experiment-results
+```
+
+### Summarize relative cost
+
+Analyze the latest experiment run:
+
+```sh
+npm run pin-github-actions-experiment:compare
+```
+
+Analyze a specific run directory:
+
+```sh
+npm run pin-github-actions-experiment:compare -- vally-experiment-results/2026-06-26T05-33-51-234Z
+```
+
+The experiment writes one directory per variant under the timestamped run folder.
+Compare `results.jsonl` and `run-summary.jsonl` for `scripted-main` and
+`skill-only-baseline` to see whether the scripted skill reduces model calls,
+tokens, or cost. The baseline variant loads its skill from
+`.worktrees/experiments-baseline/`, so keep that branch available locally before
+running the comparison.
+
+## More information
+
+See the [Vally documentation](https://microsoft.github.io/vally/) for the full eval, suite, and experiment reference.
diff --git a/evals/security/pin-github-actions-usage/compare.js b/evals/security/pin-github-actions-usage/compare.js
@@ -0,0 +1,174 @@
+#!/usr/bin/env node
+
+import { readFileSync, readdirSync, existsSync, statSync } from "fs";
+import { join, resolve } from "path";
+
+const DEFAULT_RESULTS_DIR = "vally-experiment-results"; // Results root used by resolveRunDir when no path argument is given.
+const PRIMARY_VARIANTS = ["scripted-main", "skill-only-baseline"]; // Variant directory names main() requires inside a run directory.
+
+function fail(message) { // Report a fatal problem: print `message` on stderr, then end the process.
+  console.error(message);
+  process.exit(1); // Exit status 1; callers in this file rely on fail() never returning.
+}
+
+function readJsonLines(filePath) { // Read `filePath` as UTF-8 and return an array holding one parsed JSON value per non-blank line.
+  return readFileSync(filePath, "utf8")
+    .split("\n")
+    .map((line) => line.trim()) // Trimming also strips the trailing "\r" left by CRLF line endings.
+    .filter(Boolean) // Drop empty lines, including the one after a final newline.
+    .map((line) => JSON.parse(line)); // A malformed line throws here; no file or line context is added to the error.
+}
+
+function latestRunDir(resultsRoot) { // Return the path of the immediate subdirectory of `resultsRoot` whose name sorts last.
+  const entries = readdirSync(resultsRoot, { withFileTypes: true })
+    .filter((entry) => entry.isDirectory()) // Only immediate subdirectories are candidates.
+    .map((entry) => entry.name)
+    .sort(); // Default string sort. NOTE(review): "latest" presumably relies on timestamp-named run folders — confirm.
+
+  if (entries.length === 0) {
+    fail(`No experiment runs found in ${resultsRoot}`); // Exits the process, so the indexing below never runs on an empty list.
+  }
+
+  return join(resultsRoot, entries[entries.length - 1]);
+}
+
+function resolveRunDir(argPath) { // Turn an optional path argument into the absolute path of the run directory to analyze.
+  const candidate = argPath || DEFAULT_RESULTS_DIR; // A missing or empty argument falls back to the default results root.
+  const resolved = resolve(candidate); // Relative paths are resolved against the current working directory.
+
+  if (!existsSync(resolved)) {
+    fail(`Experiment output path not found: ${candidate}`); // Messages echo the path as given, not the resolved form.
+  }
+
+  if (!statSync(resolved).isDirectory()) {
+    fail(`Experiment output path is not a directory: ${candidate}`);
+  }
+
+  if (existsSync(join(resolved, "report.md"))) { // A directory containing report.md is treated as a single run directory.
+    return resolved;
+  }
+
+  return latestRunDir(resolved); // Otherwise treat it as a results root and pick its last-sorting subdirectory.
+}
+
+function collectVariantMetrics(runDir, variant) { // Sum token, model-call, and cost metrics over every trial recorded for `variant` in `runDir`.
+  const resultsPath = join(runDir, variant, "results.jsonl");
+  if (!existsSync(resultsPath)) {
+    fail(`Missing results for variant '${variant}' in ${runDir}`);
+  }
+
+  const trials = readJsonLines(resultsPath).filter((row) => row.type === "trial-result"); // Rows of any other type are ignored.
+  if (trials.length === 0) {
+    fail(`No trial results found for variant '${variant}' in ${resultsPath}`);
+  }
+
+  const totals = {
+    totalTokens: 0,
+    inputTokens: 0,
+    outputTokens: 0,
+    modelCalls: 0,
+    usdCost: 0,
+  };
+  let hasUsdCost = false; // Becomes true once any trial reports a numeric cost.
+
+  for (const trial of trials) {
+    const metrics = trial.trajectory?.metrics || {}; // Trials without trajectory metrics contribute zeros.
+    const tokenUsage = metrics.tokenUsage || {};
+
+    totals.totalTokens += tokenUsage.totalTokens || 0;
+    totals.inputTokens += tokenUsage.inputTokens || 0;
+    totals.outputTokens += tokenUsage.outputTokens || 0;
+    totals.modelCalls += tokenUsage.callCount || 0; // The source field is `callCount`; it is reported here as `modelCalls`.
+
+    const explicitCost = metrics.costUsd ?? tokenUsage.costUsd ?? trial.costUsd; // The first non-nullish of the three locations wins.
+    if (typeof explicitCost === "number") {
+      totals.usdCost += explicitCost;
+      hasUsdCost = true;
+    }
+  }
+
+  return { trials: trials.length, totals, hasUsdCost }; // `trials` is the number of trial-result rows, not the rows themselves.
+}
+
+function formatNumber(value) { // Render `value` for a table cell: an em dash for non-numbers and NaN, otherwise en-US number formatting.
+  if (typeof value !== "number" || Number.isNaN(value)) {
+    return "—";
+  }
+
+  return new Intl.NumberFormat("en-US").format(value); // Uses the locale's default formatting options.
+}
+
+function formatPercent(value) { // Render `value` as a signed percentage with two decimals, or an em dash when it is not finite.
+  if (!Number.isFinite(value)) {
+    return "—";
+  }
+
+  const sign = value > 0 ? "+" : value < 0 ? "-" : ""; // Explicit sign so increases read as "+"; exactly zero gets no sign.
+  return `${sign}${Math.abs(value).toFixed(2)}%`;
+}
+
+function formatDelta(delta, baseline) { // Express `delta` as a percentage of `baseline`, or an em dash when that ratio is undefined.
+  if (!Number.isFinite(delta) || !Number.isFinite(baseline) || baseline === 0) { // A zero baseline would divide by zero.
+    return "—";
+  }
+
+  return formatPercent((delta / baseline) * 100);
+}
+
+function compareMetric(scripted, baseline, key) { // Pair the two variants' totals for metric `key` with the formatted relative change.
+  const delta = scripted.totals[key] - baseline.totals[key]; // Negative when the scripted variant used less than the baseline.
+  return {
+    scripted: scripted.totals[key],
+    baseline: baseline.totals[key],
+    relative: formatDelta(delta, baseline.totals[key]), // Already a display string, unlike the two raw numbers above.
+  };
+}
+
+function metricRows(scripted, baseline) { // Build the ordered [label, comparison] pairs that become the rows of the output table.
+  const rows = [
+    ["Total tokens", compareMetric(scripted, baseline, "totalTokens")],
+    ["Input tokens", compareMetric(scripted, baseline, "inputTokens")],
+    ["Output tokens", compareMetric(scripted, baseline, "outputTokens")],
+    ["Model calls", compareMetric(scripted, baseline, "modelCalls")],
+  ];
+
+  if (scripted.hasUsdCost || baseline.hasUsdCost) { // NOTE(review): if only one variant reported cost, the other is compared as 0.
+    rows.unshift(["USD cost", compareMetric(scripted, baseline, "usdCost")]); // unshift places cost first in the table.
+  }
+
+  return rows;
+}
+
+function printMarkdownTable(runDir, scripted, baseline) {
+  console.log(`Run: ${runDir}`);
+  console.log("");
+  console.log(`Trials per variant: ${scripted.trials}`);
+  console.log("");
+  console.log(`| Metric across ${scripted.trials} trials | Scripted | Skill-only baseline | Delta (scripted vs baseline) |`);
+  console.log("| --- | --- | --- | --- |");
+
+  for (const [label, values] of metricRows(scripted, baseline)) {
+    console.log(
+      `| ${label} | ${formatNumber(values.scripted)} | ${formatNumber(values.baseline)} | ${values.relative} |`
+    );
+  }
+}
+
+function main() { // Entry point: locate the run directory, verify both primary variants exist, then print their comparison table.
+  const runDir = resolveRunDir(process.argv[2]); // Optional first CLI argument: a results root or a specific run directory.
+  const variants = readdirSync(runDir, { withFileTypes: true })
+    .filter((entry) => entry.isDirectory())
+    .map((entry) => entry.name);
+
+  for (const variant of PRIMARY_VARIANTS) {
+    if (!variants.includes(variant)) {
+      fail(`Expected variant '${variant}' in ${runDir}. Found variants: ${variants.sort().join(", ")}`); // sort() mutates `variants`; harmless because fail() exits.
+    }
+  }
+
+  const scripted = collectVariantMetrics(runDir, "scripted-main");
+  const baseline = collectVariantMetrics(runDir, "skill-only-baseline");
+  printMarkdownTable(runDir, scripted, baseline);
+}
+
+main(); // Runs immediately when the module is loaded.