From 02e6d874e0bfdaae48456b2ffa07a04ab6227dd7 Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Thu, 25 Jun 2026 16:34:17 -0500
Subject: [PATCH 1/5] update sdk with new adds

---
 CHANGELOG.md                                  |  16 +
 nucleus/__init__.py                           | 253 +++++++++++++-
 nucleus/connection.py                         |   5 +
 nucleus/data_transfer_object/evaluation_v2.py |   4 +
 nucleus/dataset.py                            |  15 +
 nucleus/evaluation_v2.py                      | 167 +++++++--
 nucleus/evaluation_v2_exclusions.py           | 112 ++++++
 nucleus/evaluation_v2_preset.py               | 111 ++++++
 pyproject.toml                                |   2 +-
 tests/test_evaluation_v2.py                   | 196 +++++++++--
 tests/test_evaluation_v2_presets.py           | 327 ++++++++++++++++++
 11 files changed, 1146 insertions(+), 62 deletions(-)
 create mode 100644 nucleus/evaluation_v2_exclusions.py
 create mode 100644 nucleus/evaluation_v2_preset.py
 create mode 100644 tests/test_evaluation_v2_presets.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4212614c..56345247 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,22 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.18.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.9) - 2026-06-25
+
+### Added
+- **Evaluations V2 slice scoping and exclusion rules.** `create_evaluation_v2()` accepts `slice_id` (restrict the evaluation to a slice's items) and `exclusion_rules` (drop items/annotations before metrics are computed) via the new `MetadataExclusionRule`, `LabelExclusionRule`, and `BoxAreaExclusionRule` types (or equivalent dicts). The `EvaluationV2` resource exposes `slice_id`, `exclusion_rules`, and `exclusion_stats`. `EvaluationV2FilterArgs` gains `gt_area_range` (filter by ground-truth box area, e.g. COCO small/medium/large bands) and `slice_ids`, applied by both `charts()` and `examples()`.
+- **Evaluation V2 presets.** Save and reuse evaluation configurations (`name` + `allowed_label_matches` + `exclusion_rules`) via `NucleusClient.list_evaluation_v2_presets()`, `create_evaluation_v2_preset()`, `update_evaluation_v2_preset()`, and `delete_evaluation_v2_preset()`, plus the new `EvaluationV2Preset` resource (with `update()` / `delete()`). Apply a preset directly when creating an evaluation: `create_evaluation_v2(model_run_id, preset=preset)` seeds the matches and rules (explicit arguments override the preset).
+- `create_evaluation_v2()` accepts `only_items_with_predictions` to restrict the evaluation to items that have at least one prediction.
+- **Batch create.** `create_evaluations_v2_batch()` creates one evaluation per `(model_run_id, slice_id)` pair with a shared configuration, running concurrently and returning a `BatchEvaluationResult` per job (capturing the created evaluation or the per-job error).
+- **Cancel & retry.** `EvaluationV2.cancel()` stops a running evaluation; `EvaluationV2.retry()` re-runs a failed one, reusing its slice/matches/exclusion rules.
+- `Dataset.evaluation_label_schema()` returns the dataset's ground-truth and prediction label vocabularies (`gt_labels` / `prediction_labels`) for building label matches and label exclusion rules.
+
+### Changed
+- `EvaluationV2.examples()` now treats `match_type` as optional — omit it to return examples of all match types.
+
+### Fixed
+- `EvaluationV2.charts()` now issues a `POST` (matching the backend route) instead of a `GET` with a query string; the `GET` request did not reach the server.
+
 ## [0.18.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.8) - 2026-06-17
 
 ### Fixed
diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index 12e3c540..a8591c77 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -3,6 +3,7 @@
 __all__ = [
     "AsyncJob",
     "AllowedLabelMatch",
+    "BatchEvaluationResult",
     "EmbeddingsExportJob",
     "BoxAnnotation",
     "DeduplicationJob",
@@ -24,7 +25,11 @@
     "EvaluationV2ExamplesPage",
     "EvaluationV2FilterArgs",
     "EvaluationV2MatchExample",
+    "EvaluationV2Preset",
     "EvaluationV2Status",
+    "MetadataExclusionRule",
+    "LabelExclusionRule",
+    "BoxAreaExclusionRule",
     "Frame",
     "Keypoint",
     "KeypointsAnnotation",
@@ -57,6 +62,7 @@
 import datetime
 import os
 import warnings
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import requests
@@ -161,7 +167,19 @@
     NotFoundError,
     NucleusAPIError,
 )
-from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status
+from .evaluation_v2 import (
+    AllowedLabelMatch,
+    BatchEvaluationResult,
+    EvaluationV2,
+    EvaluationV2Status,
+)
+from .evaluation_v2_exclusions import (
+    BoxAreaExclusionRule,
+    EvaluationV2ExclusionRule,
+    LabelExclusionRule,
+    MetadataExclusionRule,
+)
+from .evaluation_v2_preset import _UNSET, EvaluationV2Preset
 from .job import CustomerJobTypes
 from .local_deduplication import (
     LocalDeduplicationResult,
@@ -902,6 +920,12 @@ def create_evaluation_v2(
         name: Optional[str] = None,
         allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
         allowed_label_matches_id: Optional[str] = None,
+        slice_id: Optional[str] = None,
+        exclusion_rules: Optional[
+            List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
+        ] = None,
+        only_items_with_predictions: bool = False,
+        preset: Optional[EvaluationV2Preset] = None,
     ) -> EvaluationV2:
         """Create an evaluation for a model run.
 
@@ -914,10 +938,30 @@ def create_evaluation_v2(
             name: Optional display name.
             allowed_label_matches: Optional label pairs to treat as matches.
             allowed_label_matches_id: Optional id of a saved label-match configuration.
+            slice_id: Optional slice id (``slc_*``) to scope the evaluation to the
+                items in that slice. Must belong to the model run's dataset.
+            exclusion_rules: Optional rules that drop items/annotations before metrics
+                are computed. Each entry is a
+                :class:`~nucleus.evaluation_v2_exclusions.MetadataExclusionRule`,
+                :class:`~nucleus.evaluation_v2_exclusions.LabelExclusionRule`, or
+                :class:`~nucleus.evaluation_v2_exclusions.BoxAreaExclusionRule`
+                (or an equivalent plain dict). Per-rule validation happens server-side;
+                a malformed rule rejects the whole request with a descriptive error.
+            only_items_with_predictions: If ``True``, restrict the evaluation to
+                items that have at least one model prediction.
+            preset: Optional :class:`EvaluationV2Preset` whose
+                ``allowed_label_matches`` and ``exclusion_rules`` seed this
+                evaluation. Explicit ``allowed_label_matches`` / ``exclusion_rules``
+                arguments take precedence over the preset's values.
 
         Returns:
             :class:`EvaluationV2`: The created evaluation.
         """
+        if preset is not None:
+            if allowed_label_matches is None:
+                allowed_label_matches = preset.allowed_label_matches
+            if exclusion_rules is None and preset.exclusion_rules is not None:
+                exclusion_rules = list(preset.exclusion_rules)
         payload: Dict[str, Any] = {}
         if name is not None:
             payload["name"] = name
@@ -927,6 +971,15 @@ def create_evaluation_v2(
             ]
         if allowed_label_matches_id is not None:
             payload["allowed_label_matches_id"] = allowed_label_matches_id
+        if slice_id is not None:
+            payload["sliceId"] = slice_id
+        if exclusion_rules is not None:
+            payload["exclusionRules"] = [
+                rule.to_api_dict() if hasattr(rule, "to_api_dict") else rule
+                for rule in exclusion_rules
+            ]
+        if only_items_with_predictions:
+            payload["onlyItemsWithPredictions"] = True
         result = self.make_request(
             payload, f"modelRun/{model_run_id}/evaluationsV2"
         )
@@ -937,6 +990,96 @@ def create_evaluation_v2(
             )
         return self.get_evaluation_v2(str(eval_id))
 
+    def create_evaluations_v2_batch(
+        self,
+        model_run_ids: List[str],
+        *,
+        slice_ids: Optional[List[Optional[str]]] = None,
+        name_prefix: Optional[str] = None,
+        allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
+        allowed_label_matches_id: Optional[str] = None,
+        exclusion_rules: Optional[
+            List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
+        ] = None,
+        only_items_with_predictions: bool = False,
+        preset: Optional[EvaluationV2Preset] = None,
+        max_workers: int = 4,
+    ) -> List[BatchEvaluationResult]:
+        """Create many evaluations at once, sharing one configuration.
+
+        Mirrors the batch-create flow in the UI. Jobs are the cross-product of
+        ``model_run_ids`` and ``slice_ids`` (run-major order) and share one
+        configuration. They run on a thread pool; an exception in one job is
+        recorded on that job's result rather than aborting the batch.
+
+        Parameters:
+            model_run_ids: Model run ids (``run_*``) to evaluate. An empty list
+                returns ``[]`` without issuing any request.
+            slice_ids: Slice ids (``slc_*``) to scope each evaluation to. Use
+                ``None`` within the list for a whole-dataset evaluation. Defaults
+                to ``[None]`` (whole dataset for every run).
+            name_prefix: Optional name prefix; when omitted no name is sent.
+                The run id is appended only if several run ids are given and
+                the slice id only if it is not ``None`` (joined by ``" — "``).
+            allowed_label_matches: Shared label-match pairs.
+            allowed_label_matches_id: Shared saved label-match config id.
+            exclusion_rules: Shared exclusion rules.
+            only_items_with_predictions: Shared "only items with predictions" flag.
+            preset: Optional preset seeding matches/rules for every job.
+            max_workers: Maximum concurrent create requests (default 4).
+
+        Returns:
+            List of :class:`BatchEvaluationResult`, in job order — each holds
+            the created :class:`EvaluationV2` or the error text for that job.
+        """
+        if not model_run_ids:
+            return []
+        targets: List[Optional[str]] = (
+            list(slice_ids) if slice_ids is not None else [None]
+        )
+        jobs: List[Tuple[str, Optional[str]]] = [
+            (run, sl) for run in model_run_ids for sl in targets
+        ]
+
+        def _name(run: str, sl: Optional[str]) -> Optional[str]:
+            if name_prefix is None:
+                return None
+            parts = [name_prefix]
+            if len(model_run_ids) > 1:
+                parts.append(run)
+            if sl is not None:
+                parts.append(sl)
+            return " — ".join(parts)
+
+        results: List[Optional[BatchEvaluationResult]] = [None] * len(jobs)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_idx = {
+                executor.submit(
+                    self.create_evaluation_v2,
+                    run,
+                    name=_name(run, sl),
+                    allowed_label_matches=allowed_label_matches,
+                    allowed_label_matches_id=allowed_label_matches_id,
+                    slice_id=sl,
+                    exclusion_rules=exclusion_rules,
+                    only_items_with_predictions=only_items_with_predictions,
+                    preset=preset,
+                ): idx
+                for idx, (run, sl) in enumerate(jobs)
+            }
+            for future in as_completed(future_to_idx):  # completion order
+                idx = future_to_idx[future]
+                run, sl = jobs[idx]
+                result = BatchEvaluationResult(
+                    model_run_id=run, slice_id=sl, name=_name(run, sl)
+                )
+                try:
+                    result.evaluation = future.result()
+                except Exception as exc:  # noqa: BLE001 - reported per job
+                    result.error = str(exc)
+                results[idx] = result  # slot keeps job order
+        return [r for r in results if r is not None]
+
     def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2:
         """Get an evaluation by id.
 
@@ -965,6 +1108,111 @@ def list_evaluations_v2(self, model_run_id: str) -> List[EvaluationV2]:
             )
         return [EvaluationV2.from_json(r, self) for r in rows]
 
+    def list_evaluation_v2_presets(self) -> List[EvaluationV2Preset]:
+        """List the current user's saved Evaluation V2 presets.
+
+        Returns one :class:`EvaluationV2Preset` per response row (presets are
+        private per user). Raises ``RuntimeError`` if the response is no list.
+        """
+        rows = self.get("evaluationV2Presets")
+        if not isinstance(rows, list):
+            raise RuntimeError(
+                f"Unexpected list evaluation V2 presets response: {rows!r}"
+            )
+        return [EvaluationV2Preset.from_json(r, self) for r in rows]
+
+    def create_evaluation_v2_preset(
+        self,
+        name: str,
+        *,
+        allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
+        exclusion_rules: Optional[
+            List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
+        ] = None,
+    ) -> EvaluationV2Preset:
+        """Create a saved Evaluation V2 preset.
+
+        Parameters:
+            name: Preset name. Must be non-empty and unique among the user's
+                presets (not checked client-side).
+            allowed_label_matches: Optional label pairs; omitted when ``None``.
+            exclusion_rules: Optional rule objects or equivalent plain dicts
+                (dicts are sent as-is); omitted when ``None``.
+
+        Returns:
+            :class:`EvaluationV2Preset`: The created preset.
+        """
+        payload: Dict[str, Any] = {"name": name}
+        if allowed_label_matches is not None:
+            payload["allowedLabelMatches"] = [
+                m.to_api_dict() for m in allowed_label_matches
+            ]
+        if exclusion_rules is not None:
+            payload["exclusionRules"] = [
+                rule.to_api_dict() if hasattr(rule, "to_api_dict") else rule
+                for rule in exclusion_rules
+            ]
+        data = self.post(payload, "evaluationV2Presets")
+        return EvaluationV2Preset.from_json(data, self)
+
+    def update_evaluation_v2_preset(
+        self,
+        preset_id: str,
+        *,
+        name: Any = _UNSET,
+        allowed_label_matches: Any = _UNSET,
+        exclusion_rules: Any = _UNSET,
+    ) -> EvaluationV2Preset:
+        """Update a saved Evaluation V2 preset.
+
+        Omitted arguments keep the ``_UNSET`` sentinel and are not sent; an
+        explicit ``None`` is sent as ``None`` (clears ``exclusion_rules``).
+
+        Parameters:
+            preset_id: Preset id (``prev_*``). Must be owned by the caller.
+            name: Optional new name.
+            allowed_label_matches: Optional new label-match list, or ``None``.
+            exclusion_rules: Optional new exclusion rules, or ``None`` to clear.
+
+        Returns:
+            :class:`EvaluationV2Preset`: The updated preset.
+        """
+        payload: Dict[str, Any] = {}
+        if name is not _UNSET:
+            payload["name"] = name
+        if allowed_label_matches is not _UNSET:
+            payload["allowedLabelMatches"] = (
+                None
+                if allowed_label_matches is None
+                else [m.to_api_dict() for m in allowed_label_matches]
+            )
+        if exclusion_rules is not _UNSET:
+            payload["exclusionRules"] = (
+                None
+                if exclusion_rules is None
+                else [
+                    rule.to_api_dict()
+                    if hasattr(rule, "to_api_dict")
+                    else rule
+                    for rule in exclusion_rules
+                ]
+            )
+        data = self.patch(payload, f"evaluationV2Presets/{preset_id}")
+        return EvaluationV2Preset.from_json(data, self)
+
+    def delete_evaluation_v2_preset(self, preset_id: str) -> None:
+        """Delete a saved Evaluation V2 preset; the response is discarded.
+
+        Parameters:
+            preset_id: Preset id (``prev_*``). Must be owned by the caller.
+        """
+        self.make_request(
+            {},
+            f"evaluationV2Presets/{preset_id}",
+            requests_command=requests.delete,
+            return_raw_response=True,
+        )
+
     @deprecated(msg="Prefer calling Dataset.info() directly.")
     def dataset_info(self, dataset_id: str):
         dataset = self.get_dataset(dataset_id)
@@ -1316,6 +1564,9 @@ def delete(self, route: str):
     def get(self, route: str):
         return self.connection.get(route)
 
+    def patch(self, payload: dict, route: str):
+        return self.connection.patch(payload, route)  # same as get/post
+
     def post(self, payload: dict, route: str):
         return self.connection.post(payload, route)
 
diff --git a/nucleus/connection.py b/nucleus/connection.py
index 467600a9..050f12ff 100644
--- a/nucleus/connection.py
+++ b/nucleus/connection.py
@@ -50,6 +50,11 @@ def delete(self, route: str):
     def get(self, route: str):
         return self.make_request({}, route, requests_command=requests.get)
 
+    def patch(self, payload: dict, route: str):  # PATCH twin of post()
+        return self.make_request(
+            payload, route, requests_command=requests.patch
+        )
+
     def post(self, payload: dict, route: str):
         return self.make_request(
             payload, route, requests_command=requests.post
diff --git a/nucleus/data_transfer_object/evaluation_v2.py b/nucleus/data_transfer_object/evaluation_v2.py
index a1abb443..b2f11da7 100644
--- a/nucleus/data_transfer_object/evaluation_v2.py
+++ b/nucleus/data_transfer_object/evaluation_v2.py
@@ -43,9 +43,11 @@ class MetadataPredicate(DictCompatibleModel):
     "gt_labels": "gtLabels",
     "item_metadata": "itemMetadata",
     "prediction_metadata": "predictionMetadata",
+    "gt_area_range": "gtAreaRange",
     "label_equality": "labelEquality",
     "has_ground_truth": "hasGroundTruth",
     "tide_background": "tideBackground",
+    "slice_ids": "sliceIds",
 }
 
 
@@ -58,9 +60,11 @@ class EvaluationV2FilterArgs(DictCompatibleModel):
     gt_labels: Optional[List[str]] = None
     item_metadata: Optional[List[MetadataPredicate]] = None
     prediction_metadata: Optional[List[MetadataPredicate]] = None
+    gt_area_range: Optional[RangeNum] = None
     label_equality: Optional[Literal["EQ", "NEQ"]] = None
     has_ground_truth: Optional[bool] = None
     tide_background: Optional[bool] = None
+    slice_ids: Optional[List[str]] = None
 
     def to_api_filters(self) -> Dict[str, Any]:
         """Return filters as a dict ready for API requests."""
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
index f5efee54..6f705c81 100644
--- a/nucleus/dataset.py
+++ b/nucleus/dataset.py
@@ -228,6 +228,21 @@ def slices(self) -> List[Slice]:
         )
         return [Slice.from_request(info, self._client) for info in response]
 
+    def evaluation_label_schema(self) -> Dict[str, List[str]]:
+        """Ground-truth and prediction label vocabularies for this dataset.
+
+        Fetched with ``GET dataset/{id}/labelSchema``; returned unmodified.
+        Useful for building :meth:`NucleusClient.create_evaluation_v2`
+        ``allowed_label_matches`` and label exclusion rules without guessing
+        label names. Mirrors the label lists shown in the Create Evaluation UI.
+
+        Returns:
+            Dict with ``"gt_labels"`` and ``"prediction_labels"`` label lists.
+        """
+        return self._client.make_request(
+            {}, f"dataset/{self.id}/labelSchema", requests.get
+        )
+
     @property
     def embedding_indexes(self) -> List[EmbeddingIndex]:
         """Gets all the embedding indexes belonging to this Dataset."""
diff --git a/nucleus/evaluation_v2.py b/nucleus/evaluation_v2.py
index 43f8a03c..aa90281b 100644
--- a/nucleus/evaluation_v2.py
+++ b/nucleus/evaluation_v2.py
@@ -7,7 +7,6 @@
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
-from urllib.parse import urlencode
 
 import requests
 
@@ -37,6 +36,23 @@ class EvaluationV2Status(str, Enum):
 }
 
 
+def _parse_json_field(value: Any) -> Optional[Any]:
+    """Normalize a JSONB column that may arrive as a string or already parsed.
+
+    The REST ``GET``/``LIST`` evaluation endpoints return raw DB rows, so the
+    ``exclusion_rules`` / ``exclusion_stats`` jsonb columns can come back either
+    already decoded (dict/list) or as a JSON string depending on the driver.
+    """
+    if value is None or isinstance(value, (dict, list)):  # already decoded
+        return value
+    if isinstance(value, str):
+        try:
+            return json.loads(value)
+        except (ValueError, TypeError):
+            return None  # undecodable string is reported as missing
+    return value  # any other type is passed through unchanged
+
+
 @dataclass
 class AllowedLabelMatch:
     """Ground-truth and prediction label pair that counts as a match."""
@@ -51,6 +67,56 @@ def to_api_dict(self) -> Dict[str, str]:
         }
 
 
+def parse_allowed_label_matches(
+    raw_matches: Any,
+) -> Optional[List[AllowedLabelMatch]]:
+    """Parse an ``allowed_label_matches`` array from an API payload.
+
+    Returns ``None`` unless given a list. camelCase keys win over snake_case;
+    non-dict entries and entries missing either label are dropped.
+    """
+    if not isinstance(raw_matches, list):
+        return None
+    matches: List[AllowedLabelMatch] = []
+    for m in raw_matches:
+        if not isinstance(m, dict):
+            continue
+        gt = m.get("groundTruthLabel")
+        if gt is None:
+            gt = m.get("ground_truth_label")
+        mp = m.get("modelPredictionLabel")
+        if mp is None:
+            mp = m.get("model_prediction_label")
+        if gt is not None and mp is not None:
+            matches.append(
+                AllowedLabelMatch(
+                    ground_truth_label=str(gt),  # labels are coerced to str
+                    model_prediction_label=str(mp),
+                )
+            )
+    return matches
+
+
+@dataclass
+class BatchEvaluationResult:
+    """Outcome of one job in a batch create call.
+
+    ``evaluation`` is set on success; ``error`` holds the error message on
+    failure. Use :attr:`succeeded` to filter, and re-run the failed jobs by
+    feeding their ``model_run_id`` / ``slice_id`` back into a new batch call.
+    """
+
+    model_run_id: str  # model run this job targeted
+    slice_id: Optional[str] = None  # None means a whole-dataset job
+    name: Optional[str] = None  # name passed for this job, if any
+    evaluation: Optional["EvaluationV2"] = None  # set when the job succeeded
+    error: Optional[str] = None  # error message when the job failed
+
+    @property
+    def succeeded(self) -> bool:
+        return self.evaluation is not None  # True once evaluation is set
+
+
 @dataclass
 class EvaluationV2:
     """An Evaluation V2 run for a model run."""
@@ -66,6 +132,9 @@ class EvaluationV2:
     allowed_label_matches_id: Optional[str] = None
     allowed_label_matches: Optional[List[AllowedLabelMatch]] = None
     allowed_label_matches_name: Optional[str] = None
+    slice_id: Optional[str] = None
+    exclusion_rules: Optional[List[Dict[str, Any]]] = None
+    exclusion_stats: Optional[Dict[str, Any]] = None
     _client: Optional["NucleusClient"] = field(repr=False, default=None)
 
     @classmethod
@@ -74,26 +143,9 @@ def from_json(
         payload: Dict[str, Any],
         client: Optional["NucleusClient"] = None,
     ) -> "EvaluationV2":
-        raw_matches = payload.get("allowed_label_matches")
-        matches: Optional[List[AllowedLabelMatch]] = None
-        if isinstance(raw_matches, list):
-            matches = []
-            for m in raw_matches:
-                if not isinstance(m, dict):
-                    continue
-                gt = m.get("groundTruthLabel")
-                if gt is None:
-                    gt = m.get("ground_truth_label")
-                mp = m.get("modelPredictionLabel")
-                if mp is None:
-                    mp = m.get("model_prediction_label")
-                if gt is not None and mp is not None:
-                    matches.append(
-                        AllowedLabelMatch(
-                            ground_truth_label=str(gt),
-                            model_prediction_label=str(mp),
-                        )
-                    )
+        matches = parse_allowed_label_matches(
+            payload.get("allowed_label_matches")
+        )
 
         return cls(
             id=str(payload["id"]),
@@ -109,6 +161,9 @@ def from_json(
             allowed_label_matches_name=payload.get(
                 "allowed_label_matches_name"
             ),
+            slice_id=payload.get("slice_id"),
+            exclusion_rules=_parse_json_field(payload.get("exclusion_rules")),
+            exclusion_stats=_parse_json_field(payload.get("exclusion_stats")),
             _client=client,
         )
 
@@ -170,6 +225,45 @@ def delete(self) -> None:
             return_raw_response=True,
         )
 
+    def cancel(self) -> "EvaluationV2":
+        """Cancel this evaluation if it is still running.
+
+        Stops the evaluation and sets its status to ``cancelled``. Finished
+        evaluations cannot be cancelled (use :meth:`delete` to archive them).
+        Raises ``RuntimeError`` if this object has no client.
+
+        Returns: the value of :meth:`refresh`, called after the cancel request.
+        """
+        if self._client is None:
+            raise RuntimeError("EvaluationV2 has no client.")
+        self._client.make_request(
+            {},
+            f"evaluationsV2/{self.id}/cancel",
+            requests_command=requests.post,
+            return_raw_response=True,
+        )
+        return self.refresh()
+
+    def retry(self) -> "EvaluationV2":
+        """Retry this evaluation if it failed.
+
+        Posts to ``evaluationsV2/{id}/retry``. Creates a new evaluation for the
+        same model run, reusing this one's slice, allowed-label-matches, and
+        exclusion rules. Only ``failed`` evaluations can be retried.
+
+        Returns the new :class:`EvaluationV2`; raises ``RuntimeError`` if the
+        response has no ``evaluation_id`` or this object has no client.
+        """
+        if self._client is None:
+            raise RuntimeError("EvaluationV2 has no client.")
+        result = self._client.post({}, f"evaluationsV2/{self.id}/retry")
+        eval_id = result.get("evaluation_id")
+        if not eval_id:
+            raise RuntimeError(
+                f"Unexpected retry evaluation V2 response: {result}"
+            )
+        return self._client.get_evaluation_v2(str(eval_id))
+
     def charts(
         self,
         iou_threshold: float = 0.5,
@@ -190,24 +284,20 @@ def charts(
         """
         if self._client is None:
             raise RuntimeError("EvaluationV2 has no client.")
-        params: Dict[str, str] = {}
-        params["iouThreshold"] = str(iou_threshold)
+        payload: Dict[str, Any] = {"iouThreshold": iou_threshold}
         if filters is not None:
             if isinstance(filters, EvaluationV2FilterArgs):
-                filt_dict = filters.to_api_filters()
+                payload["filters"] = filters.to_api_filters()
             else:
-                filt_dict = filters
-            params["filters"] = json.dumps(filt_dict)
+                payload["filters"] = filters
         if query:
-            params["query"] = query
-        qs = urlencode(params)
-        route = f"evaluationsV2/{self.id}/charts?{qs}"
-        data = self._client.get(route)
+            payload["query"] = query
+        data = self._client.post(payload, f"evaluationsV2/{self.id}/charts")
         return EvaluationV2Charts.parse_obj(data)
 
     def examples(
         self,
-        match_type: str,
+        match_type: Optional[str] = None,
         limit: int = 50,
         offset: int = 0,
         sort_by: Optional[str] = None,
@@ -217,14 +307,16 @@ def examples(
         ] = None,
         query: Optional[str] = None,
     ) -> EvaluationV2ExamplesPage:
-        """Return paginated true-positive, false-positive, or false-negative examples.
+        """Return paginated match examples, optionally filtered by match type.
 
         Parameters:
-            match_type: ``"TP"``, ``"FP"``, or ``"FN"``.
-            limit: Page size (default 50).
+            match_type: ``"TP"``, ``"FP"``, or ``"FN"``. Omit (or ``None``) to
+                return examples of all match types.
+            limit: Page size (default 50, max 100).
             offset: Row offset for pagination.
-            sort_by: Optional field to sort by.
-            sort_order: Optional sort direction (e.g. ``"asc"`` or ``"desc"``).
+            sort_by: Optional field to sort by — one of ``"confidence"``,
+                ``"iou"``, ``"dataset_item_id"``, ``"gt_area"``.
+            sort_order: Optional sort direction (``"ASC"`` or ``"DESC"``).
             filters: Optional filters (:class:`EvaluationV2FilterArgs` or dict).
             query: Optional query string to narrow results.
 
@@ -234,10 +326,11 @@ def examples(
         if self._client is None:
             raise RuntimeError("EvaluationV2 has no client.")
         payload: Dict[str, Any] = {
-            "match_type": match_type,
             "limit": limit,
             "offset": offset,
         }
+        if match_type is not None:
+            payload["match_type"] = match_type
         if sort_by is not None:
             payload["sort_by"] = sort_by
         if sort_order is not None:
diff --git a/nucleus/evaluation_v2_exclusions.py b/nucleus/evaluation_v2_exclusions.py
new file mode 100644
index 00000000..c529f2e1
--- /dev/null
+++ b/nucleus/evaluation_v2_exclusions.py
@@ -0,0 +1,112 @@
+"""Exclusion rules for Evaluation V2 creation.
+
+Mirrors ``packages/shared/src/nucleus/evaluationV2Exclusions.ts`` on the backend.
+These rules drop items/annotations from an evaluation before metrics are computed.
+
+The per-rule shape is validated server-side at create time
+(``parseEvaluationV2ExclusionRulesWithDiagnostics``), which reports exactly which
+rules were rejected and why — so these classes only need to serialize correctly.
+
+Pass instances (or equivalent plain dicts) to
+:meth:`nucleus.NucleusClient.create_evaluation_v2` via ``exclusion_rules``::
+
+    client.create_evaluation_v2(
+        model_run_id,
+        exclusion_rules=[
+            BoxAreaExclusionRule(scope="annotation", target="groundTruth", min=1024),
+            LabelExclusionRule(scope="item", target="prediction", labels=["ignore"]),
+            MetadataExclusionRule(key="is_dark", op="EQ", value=True),
+        ],
+    )
+"""
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+
+# String literals are sent as values (not keys), so the server's request-body
+# camelcaser preserves them verbatim — emit them exactly as the backend expects.
+ExclusionScope = str  # "item" | "annotation"
+ExclusionTarget = str  # "groundTruth" | "prediction"
+MetadataOp = str  # "EQ" | "IN" | "GT" | "LT"
+
+
+@dataclass
+class MetadataExclusionRule:
+    """Exclude whole items whose item-metadata ``key`` matches ``value`` under ``op``.
+
+    ``scope`` is always ``"item"`` for metadata rules.
+    """
+
+    key: str
+    op: MetadataOp
+    value: Any
+    scope: ExclusionScope = "item"
+
+    def to_api_dict(self) -> Dict[str, Any]:
+        return {
+            "type": "metadata",
+            "scope": self.scope,
+            "key": self.key,
+            "op": self.op,
+            "value": self.value,
+        }
+
+
+@dataclass
+class LabelExclusionRule:
+    """Exclude annotations/predictions (or whole items) carrying any of ``labels``.
+
+    Parameters:
+        scope: ``"item"`` (drop the whole item if any annotation matches) or
+            ``"annotation"`` (drop only matching annotations).
+        target: ``"groundTruth"`` or ``"prediction"`` — which side to filter.
+        labels: Labels to exclude.
+    """
+
+    scope: ExclusionScope
+    target: ExclusionTarget
+    labels: List[str] = field(default_factory=list)
+
+    def to_api_dict(self) -> Dict[str, Any]:
+        return {
+            "type": "labels",
+            "scope": self.scope,
+            "target": self.target,
+            "labels": list(self.labels),
+        }
+
+
+@dataclass
+class BoxAreaExclusionRule:
+    """Exclude boxes whose pixel area falls outside ``[min, max]`` (at least one bound required).
+
+    Parameters:
+        scope: ``"item"`` or ``"annotation"``.
+        target: ``"groundTruth"`` or ``"prediction"``.
+        min: Minimum pixel area (inclusive lower bound); ``None`` leaves it out of the payload.
+        max: Maximum pixel area (inclusive upper bound); ``None`` leaves it out of the payload.
+    """
+
+    scope: ExclusionScope
+    target: ExclusionTarget
+    min: Optional[float] = None
+    max: Optional[float] = None
+
+    def to_api_dict(self) -> Dict[str, Any]:
+        out: Dict[str, Any] = {
+            "type": "boxArea",
+            "scope": self.scope,
+            "target": self.target,
+        }
+        if self.min is not None:
+            out["min"] = self.min
+        if self.max is not None:
+            out["max"] = self.max
+        return out
+
+
+EvaluationV2ExclusionRule = Union[
+    MetadataExclusionRule,
+    LabelExclusionRule,
+    BoxAreaExclusionRule,
+]
diff --git a/nucleus/evaluation_v2_preset.py b/nucleus/evaluation_v2_preset.py
new file mode 100644
index 00000000..36bf3630
--- /dev/null
+++ b/nucleus/evaluation_v2_preset.py
@@ -0,0 +1,111 @@
+"""Evaluation V2 presets — saved, reusable evaluation configurations.
+
+A preset bundles a ``name`` with ``allowed_label_matches`` and ``exclusion_rules``
+so the same configuration can be applied across many evaluations. Presets are
+private to the creating user.
+
+Mirrors the ``/v1/nucleus/evaluationV2Presets`` REST endpoints on the backend.
+Create and manage presets via :class:`~nucleus.NucleusClient`::
+
+    preset = client.create_evaluation_v2_preset(
+        "vehicles",
+        allowed_label_matches=[AllowedLabelMatch("car", "vehicle")],
+        exclusion_rules=[LabelExclusionRule(scope="item", target="prediction", labels=["ignore"])],
+    )
+    client.create_evaluation_v2(model_run_id, preset=preset)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from nucleus.evaluation_v2 import (
+    AllowedLabelMatch,
+    _parse_json_field,
+    parse_allowed_label_matches,
+)
+
+if TYPE_CHECKING:
+    from nucleus import NucleusClient
+
+
+# Sentinel distinguishing "argument omitted" from an explicit ``None`` (which,
+# for ``exclusion_rules`` on update, means "clear the rules").
+class _Unset:
+    def __repr__(self) -> str:  # pragma: no cover - cosmetic
+        return "<UNSET>"
+
+
+_UNSET = _Unset()
+
+
+@dataclass
+class EvaluationV2Preset:
+    """A saved Evaluation V2 configuration owned by the current user."""
+
+    id: str
+    name: str
+    allowed_label_matches: Optional[List[AllowedLabelMatch]] = None
+    exclusion_rules: Optional[List[Dict[str, Any]]] = None
+    created_by_user_id: Optional[str] = None
+    created_at: Optional[str] = None
+    updated_at: Optional[str] = None
+    deleted_at: Optional[str] = None
+    _client: Optional["NucleusClient"] = field(repr=False, default=None)
+
+    @classmethod
+    def from_json(
+        cls,
+        payload: Dict[str, Any],
+        client: Optional["NucleusClient"] = None,
+    ) -> "EvaluationV2Preset":
+        return cls(
+            id=str(payload["id"]),
+            name=str(payload["name"]),
+            allowed_label_matches=parse_allowed_label_matches(
+                payload.get("allowed_label_matches")
+            ),
+            exclusion_rules=_parse_json_field(payload.get("exclusion_rules")),
+            created_by_user_id=payload.get("created_by_user_id"),
+            created_at=payload.get("created_at"),
+            updated_at=payload.get("updated_at"),
+            deleted_at=payload.get("deleted_at"),
+            _client=client,
+        )
+
+    def update(
+        self,
+        *,
+        name: Any = _UNSET,
+        allowed_label_matches: Any = _UNSET,
+        exclusion_rules: Any = _UNSET,
+    ) -> "EvaluationV2Preset":
+        """Update this preset in place.
+
+        Only the arguments you pass are changed. Passing
+        ``exclusion_rules=None`` clears the rules; omitting it leaves them
+        unchanged.
+
+        Returns:
+            self, with updated fields.
+        """
+        if self._client is None:
+            raise RuntimeError(
+                "EvaluationV2Preset has no client; fetch it via "
+                "NucleusClient.list_evaluation_v2_presets."
+            )
+        updated = self._client.update_evaluation_v2_preset(
+            self.id,
+            name=name,
+            allowed_label_matches=allowed_label_matches,
+            exclusion_rules=exclusion_rules,
+        )
+        self.__dict__.update(updated.__dict__)
+        return self
+
+    def delete(self) -> None:
+        """Delete this preset."""
+        if self._client is None:
+            raise RuntimeError("EvaluationV2Preset has no client.")
+        self._client.delete_evaluation_v2_preset(self.id)
diff --git a/pyproject.toml b/pyproject.toml
index 794811da..57004bfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"]  # Easy ignore for getting it running
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.18.8"
+version = "0.18.9"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license =  "MIT"
 authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
diff --git a/tests/test_evaluation_v2.py b/tests/test_evaluation_v2.py
index 829e34a2..01241d58 100644
--- a/tests/test_evaluation_v2.py
+++ b/tests/test_evaluation_v2.py
@@ -5,7 +5,14 @@
 import pytest
 import requests
 
-from nucleus import AllowedLabelMatch, EvaluationV2, NucleusClient
+from nucleus import (
+    AllowedLabelMatch,
+    BoxAreaExclusionRule,
+    EvaluationV2,
+    LabelExclusionRule,
+    MetadataExclusionRule,
+    NucleusClient,
+)
 from nucleus.data_transfer_object.evaluation_v2 import (
     EvaluationV2Charts,
     EvaluationV2FilterArgs,
@@ -15,6 +22,28 @@
 )
 
 
+def _charts_response():
+    return {
+        "mapSummary": {"mapAt50": 0.1, "mapAt75": 0.2, "mapAt5095": 0.15},
+        "perClassAp": [],
+        "confusionMatrix": [],
+        "scoreHistogram": [],
+        "computedIouRanges": [],
+        "totalCounts": {"tp": 0, "fp": 0, "fn": 0, "predsWithConfidence": 0},
+        "apBySize": {"small": None, "medium": None, "large": None},
+        "prCurve": [],
+        "tideAttribution": {
+            "truePositive": 0,
+            "localization": 0,
+            "classification": 0,
+            "both": 0,
+            "duplicate": 0,
+            "background": 0,
+            "missed": 0,
+        },
+    }
+
+
 def test_evaluation_v2_filter_args_to_api_filters():
     filters = EvaluationV2FilterArgs(
         confidence_range=RangeNum(min=0.1, max=0.9),
@@ -134,27 +163,122 @@ def test_create_evaluation_v2_then_get():
     client.connection.get.assert_called_once_with("evaluationsV2/evalv2_new")
 
 
-def test_charts_get_query_string():
-    client = MagicMock(spec=NucleusClient)
-    client.get.return_value = {
-        "mapSummary": {"mapAt50": 0.1, "mapAt75": 0.2, "mapAt5095": 0.15},
-        "perClassAp": [],
-        "confusionMatrix": [],
-        "scoreHistogram": [],
-        "computedIouRanges": [],
-        "totalCounts": {"tp": 0, "fp": 0, "fn": 0, "predsWithConfidence": 0},
-        "apBySize": {"small": None, "medium": None, "large": None},
-        "prCurve": [],
-        "tideAttribution": {
-            "truePositive": 0,
-            "localization": 0,
-            "classification": 0,
-            "both": 0,
-            "duplicate": 0,
-            "background": 0,
-            "missed": 0,
+def test_create_evaluation_v2_with_slice_and_exclusion_rules():
+    client = NucleusClient(api_key="test")
+    client.connection.make_request = MagicMock(
+        return_value={"evaluation_id": "evalv2_new", "status": "pending"}
+    )
+    client.connection.get = MagicMock(
+        return_value={
+            "id": "evalv2_new",
+            "model_run_id": "run_1",
+            "dataset_id": "ds_1",
+            "status": "pending",
+        }
+    )
+    client.create_evaluation_v2(
+        "run_1",
+        slice_id="slc_x",
+        exclusion_rules=[
+            BoxAreaExclusionRule(
+                scope="annotation", target="groundTruth", min=1024
+            ),
+            LabelExclusionRule(
+                scope="item", target="prediction", labels=["ignore"]
+            ),
+            MetadataExclusionRule(key="is_dark", op="EQ", value=True),
+            {
+                "type": "labels",
+                "scope": "item",
+                "target": "groundTruth",
+                "labels": ["x"],
+            },
+        ],
+    )
+    payload = client.connection.make_request.call_args[0][0]
+    assert payload["sliceId"] == "slc_x"
+    assert payload["exclusionRules"] == [
+        {
+            "type": "boxArea",
+            "scope": "annotation",
+            "target": "groundTruth",
+            "min": 1024,
+        },
+        {
+            "type": "labels",
+            "scope": "item",
+            "target": "prediction",
+            "labels": ["ignore"],
+        },
+        {
+            "type": "metadata",
+            "scope": "item",
+            "key": "is_dark",
+            "op": "EQ",
+            "value": True,
         },
+        {
+            "type": "labels",
+            "scope": "item",
+            "target": "groundTruth",
+            "labels": ["x"],
+        },
+    ]
+
+
+def test_evaluation_v2_filter_args_gt_area_and_slices():
+    filters = EvaluationV2FilterArgs(
+        gt_area_range=RangeNum(min=1024, max=9216),
+        slice_ids=["slc_a"],
+    )
+    assert filters.to_api_filters() == {
+        "gtAreaRange": {"min": 1024.0, "max": 9216.0},
+        "sliceIds": ["slc_a"],
     }
+
+
+def test_evaluation_v2_from_json_slice_and_exclusions():
+    # exclusion_rules as a JSON string (raw jsonb), exclusion_stats as a dict.
+    ev = EvaluationV2.from_json(
+        {
+            "id": "evalv2_1",
+            "model_run_id": "run_1",
+            "dataset_id": "ds_1",
+            "status": "succeeded",
+            "slice_id": "slc_x",
+            "exclusion_rules": '[{"type":"labels","scope":"item","target":"prediction","labels":["ignore"]}]',
+            "exclusion_stats": {"totals": {"itemsDropped": 3}},
+        }
+    )
+    assert ev.slice_id == "slc_x"
+    assert ev.exclusion_rules == [
+        {
+            "type": "labels",
+            "scope": "item",
+            "target": "prediction",
+            "labels": ["ignore"],
+        }
+    ]
+    assert ev.exclusion_stats == {"totals": {"itemsDropped": 3}}
+
+
+def test_evaluation_v2_from_json_exclusions_absent():
+    ev = EvaluationV2.from_json(
+        {
+            "id": "evalv2_1",
+            "model_run_id": "run_1",
+            "dataset_id": "ds_1",
+            "status": "succeeded",
+        }
+    )
+    assert ev.slice_id is None
+    assert ev.exclusion_rules is None
+    assert ev.exclusion_stats is None
+
+
+def test_charts_post_body():
+    client = MagicMock(spec=NucleusClient)
+    client.post.return_value = _charts_response()
     ev = EvaluationV2(
         id="evalv2_1",
         model_run_id="run_1",
@@ -164,9 +288,35 @@ def test_charts_get_query_string():
     )
     charts = ev.charts(iou_threshold=0.5)
     assert isinstance(charts, EvaluationV2Charts)
-    call_route = client.get.call_args[0][0]
-    assert "evaluationsV2/evalv2_1/charts" in call_route
-    assert "iouThreshold=0.5" in call_route
+    client.post.assert_called_once()
+    payload, route = client.post.call_args[0]
+    assert route == "evaluationsV2/evalv2_1/charts"
+    assert payload == {"iouThreshold": 0.5}
+
+
+def test_charts_with_filter_args():
+    client = MagicMock(spec=NucleusClient)
+    client.post.return_value = _charts_response()
+    ev = EvaluationV2(
+        id="evalv2_1",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status="succeeded",
+        _client=client,
+    )
+    filters = EvaluationV2FilterArgs(
+        gt_area_range=RangeNum(min=1024),
+        slice_ids=["slc_a", "slc_b"],
+    )
+    ev.charts(iou_threshold=0.75, filters=filters, query="dog")
+    payload, route = client.post.call_args[0]
+    assert route == "evaluationsV2/evalv2_1/charts"
+    assert payload["iouThreshold"] == 0.75
+    assert payload["query"] == "dog"
+    assert payload["filters"] == {
+        "gtAreaRange": {"min": 1024.0},
+        "sliceIds": ["slc_a", "slc_b"],
+    }
 
 
 def test_examples_post_body():
diff --git a/tests/test_evaluation_v2_presets.py b/tests/test_evaluation_v2_presets.py
new file mode 100644
index 00000000..f6acce30
--- /dev/null
+++ b/tests/test_evaluation_v2_presets.py
@@ -0,0 +1,327 @@
+"""Unit tests for Evaluation V2 presets, batch create, cancel/retry, and
+label-schema discovery, plus optional ``match_type`` on examples (no live API)."""
+
+from unittest.mock import MagicMock
+
+import requests
+
+from nucleus import (
+    AllowedLabelMatch,
+    EvaluationV2,
+    EvaluationV2Preset,
+    LabelExclusionRule,
+    NucleusClient,
+)
+from nucleus.dataset import Dataset
+
+
+# --------------------------------------------------------------------------- #
+# Preset CRUD
+# --------------------------------------------------------------------------- #
+def test_list_evaluation_v2_presets():
+    client = NucleusClient(api_key="test")
+    client.connection.get = MagicMock(
+        return_value=[
+            {
+                "id": "prev_1",
+                "name": "vehicles",
+                "allowed_label_matches": [
+                    {"groundTruthLabel": "car", "modelPredictionLabel": "vehicle"}
+                ],
+                "exclusion_rules": None,
+                "created_by_user_id": "u_1",
+            }
+        ]
+    )
+    presets = client.list_evaluation_v2_presets()
+    client.connection.get.assert_called_once_with("evaluationV2Presets")
+    assert len(presets) == 1
+    assert presets[0].id == "prev_1"
+    assert presets[0].name == "vehicles"
+    assert presets[0].allowed_label_matches[0] == AllowedLabelMatch(
+        ground_truth_label="car", model_prediction_label="vehicle"
+    )
+
+
+def test_create_evaluation_v2_preset_payload():
+    client = NucleusClient(api_key="test")
+    client.connection.post = MagicMock(
+        return_value={
+            "id": "prev_1",
+            "name": "vehicles",
+            "allowed_label_matches": [],
+            "exclusion_rules": None,
+        }
+    )
+    preset = client.create_evaluation_v2_preset(
+        "vehicles",
+        allowed_label_matches=[AllowedLabelMatch("car", "vehicle")],
+        exclusion_rules=[
+            LabelExclusionRule(
+                scope="item", target="prediction", labels=["ignore"]
+            )
+        ],
+    )
+    payload, route = client.connection.post.call_args[0]
+    assert route == "evaluationV2Presets"
+    assert payload["name"] == "vehicles"
+    assert payload["allowedLabelMatches"] == [
+        {"ground_truth_label": "car", "model_prediction_label": "vehicle"}
+    ]
+    assert payload["exclusionRules"] == [
+        {
+            "type": "labels",
+            "scope": "item",
+            "target": "prediction",
+            "labels": ["ignore"],
+        }
+    ]
+    assert preset.id == "prev_1"
+
+
+def test_update_evaluation_v2_preset_name_only_omits_other_fields():
+    client = NucleusClient(api_key="test")
+    client.connection.patch = MagicMock(
+        return_value={"id": "prev_1", "name": "renamed"}
+    )
+    client.update_evaluation_v2_preset("prev_1", name="renamed")
+    payload, route = client.connection.patch.call_args[0]
+    assert route == "evaluationV2Presets/prev_1"
+    # Only the provided field is sent; matches/rules untouched.
+    assert payload == {"name": "renamed"}
+
+
+def test_update_evaluation_v2_preset_clear_rules_sends_null():
+    client = NucleusClient(api_key="test")
+    client.connection.patch = MagicMock(
+        return_value={"id": "prev_1", "name": "p"}
+    )
+    client.update_evaluation_v2_preset("prev_1", exclusion_rules=None)
+    payload = client.connection.patch.call_args[0][0]
+    # Explicit None clears the rules (distinct from "leave unchanged").
+    assert payload == {"exclusionRules": None}
+
+
+def test_delete_evaluation_v2_preset():
+    client = NucleusClient(api_key="test")
+    client.connection.make_request = MagicMock(return_value=MagicMock())
+    client.delete_evaluation_v2_preset("prev_1")
+    # NucleusClient.make_request forwards args positionally to the connection:
+    # (payload, route, requests_command, return_raw_response).
+    args = client.connection.make_request.call_args[0]
+    assert args[1] == "evaluationV2Presets/prev_1"
+    assert args[2] is requests.delete
+
+
+def test_preset_instance_update_and_delete_delegate_to_client():
+    client = MagicMock(spec=NucleusClient)
+    preset = EvaluationV2Preset(id="prev_1", name="p", _client=client)
+    client.update_evaluation_v2_preset.return_value = EvaluationV2Preset(
+        id="prev_1", name="renamed", _client=client
+    )
+    preset.update(name="renamed")
+    assert preset.name == "renamed"
+    preset.delete()
+    client.delete_evaluation_v2_preset.assert_called_once_with("prev_1")
+
+
+# --------------------------------------------------------------------------- #
+# Apply preset + only_items_with_predictions on create
+# --------------------------------------------------------------------------- #
+def _stub_create(client):
+    client.connection.make_request = MagicMock(
+        return_value={"evaluation_id": "evalv2_new"}
+    )
+    client.connection.get = MagicMock(
+        return_value={
+            "id": "evalv2_new",
+            "model_run_id": "run_1",
+            "dataset_id": "ds_1",
+            "status": "pending",
+        }
+    )
+
+
+def test_create_evaluation_v2_with_preset_seeds_config():
+    client = NucleusClient(api_key="test")
+    _stub_create(client)
+    preset = EvaluationV2Preset(
+        id="prev_1",
+        name="p",
+        allowed_label_matches=[AllowedLabelMatch("car", "vehicle")],
+        exclusion_rules=[
+            {
+                "type": "labels",
+                "scope": "item",
+                "target": "groundTruth",
+                "labels": ["x"],
+            }
+        ],
+    )
+    client.create_evaluation_v2("run_1", preset=preset)
+    payload = client.connection.make_request.call_args[0][0]
+    assert payload["allowed_label_matches"] == [
+        {"ground_truth_label": "car", "model_prediction_label": "vehicle"}
+    ]
+    assert payload["exclusionRules"] == [
+        {
+            "type": "labels",
+            "scope": "item",
+            "target": "groundTruth",
+            "labels": ["x"],
+        }
+    ]
+
+
+def test_create_evaluation_v2_explicit_args_override_preset():
+    client = NucleusClient(api_key="test")
+    _stub_create(client)
+    preset = EvaluationV2Preset(
+        id="prev_1",
+        name="p",
+        allowed_label_matches=[AllowedLabelMatch("car", "vehicle")],
+    )
+    client.create_evaluation_v2(
+        "run_1",
+        preset=preset,
+        allowed_label_matches=[AllowedLabelMatch("dog", "animal")],
+    )
+    payload = client.connection.make_request.call_args[0][0]
+    assert payload["allowed_label_matches"] == [
+        {"ground_truth_label": "dog", "model_prediction_label": "animal"}
+    ]
+
+
+def test_create_evaluation_v2_only_items_with_predictions():
+    client = NucleusClient(api_key="test")
+    _stub_create(client)
+    client.create_evaluation_v2("run_1", only_items_with_predictions=True)
+    payload = client.connection.make_request.call_args[0][0]
+    assert payload["onlyItemsWithPredictions"] is True
+
+
+# --------------------------------------------------------------------------- #
+# Batch create
+# --------------------------------------------------------------------------- #
+def test_create_evaluations_v2_batch_cross_product_and_error_capture():
+    client = NucleusClient(api_key="test")
+    seen = []
+
+    def fake_create(run, **kwargs):
+        seen.append((run, kwargs.get("slice_id"), kwargs.get("name")))
+        if run == "run_bad":
+            raise RuntimeError("boom")
+        ev = MagicMock(spec=EvaluationV2)
+        ev.id = f"eval_{run}_{kwargs.get('slice_id')}"
+        return ev
+
+    client.create_evaluation_v2 = fake_create
+    results = client.create_evaluations_v2_batch(
+        ["run_ok", "run_bad"],
+        slice_ids=["slc_1", None],
+        name_prefix="nightly",
+    )
+
+    # 2 runs x 2 targets = 4 jobs, returned in input order.
+    assert len(results) == 4
+    assert results[0].model_run_id == "run_ok"
+    assert results[0].slice_id == "slc_1"
+    assert results[0].name == "nightly — run_ok — slc_1"
+    assert results[0].succeeded
+    assert results[1].name == "nightly — run_ok"  # whole-dataset job
+    # Failures are captured per-job, not raised.
+    assert results[2].model_run_id == "run_bad"
+    assert not results[2].succeeded
+    assert results[2].error == "boom"
+
+
+def test_create_evaluations_v2_batch_defaults_to_whole_dataset():
+    client = NucleusClient(api_key="test")
+    client.create_evaluation_v2 = MagicMock(
+        return_value=MagicMock(spec=EvaluationV2)
+    )
+    results = client.create_evaluations_v2_batch(["run_1", "run_2"])
+    assert len(results) == 2
+    # No slice_ids -> one whole-dataset job per run.
+    for call in client.create_evaluation_v2.call_args_list:
+        assert call.kwargs["slice_id"] is None
+
+
+# --------------------------------------------------------------------------- #
+# Cancel / retry
+# --------------------------------------------------------------------------- #
+def _eval(client, status="computing"):
+    return EvaluationV2(
+        id="evalv2_1",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status=status,
+        _client=client,
+    )
+
+
+def test_evaluation_cancel_posts_and_refreshes():
+    client = MagicMock(spec=NucleusClient)
+    client.get.return_value = {
+        "id": "evalv2_1",
+        "model_run_id": "run_1",
+        "dataset_id": "ds_1",
+        "status": "cancelled",
+    }
+    ev = _eval(client)
+    ev.cancel()
+    args, kwargs = client.make_request.call_args
+    assert args[1] == "evaluationsV2/evalv2_1/cancel"
+    assert kwargs["requests_command"] is requests.post
+    assert ev.status == "cancelled"
+
+
+def test_evaluation_retry_resolves_new_evaluation():
+    client = MagicMock(spec=NucleusClient)
+    client.post.return_value = {"evaluation_id": "evalv2_retry"}
+    client.get_evaluation_v2.return_value = EvaluationV2(
+        id="evalv2_retry",
+        model_run_id="run_1",
+        dataset_id="ds_1",
+        status="pending",
+        _client=client,
+    )
+    ev = _eval(client, status="failed")
+    new_eval = ev.retry()
+    _, route = client.post.call_args[0]
+    assert route == "evaluationsV2/evalv2_1/retry"
+    assert new_eval.id == "evalv2_retry"
+    client.get_evaluation_v2.assert_called_once_with("evalv2_retry")
+
+
+# --------------------------------------------------------------------------- #
+# Examples optional match_type
+# --------------------------------------------------------------------------- #
+def test_examples_match_type_optional():
+    client = MagicMock(spec=NucleusClient)
+    client.post.return_value = {"rows": [], "total": 0}
+    ev = _eval(client, status="succeeded")
+
+    ev.examples()
+    payload = client.post.call_args[0][0]
+    assert "match_type" not in payload
+
+    ev.examples(match_type="FP")
+    payload2 = client.post.call_args[0][0]
+    assert payload2["match_type"] == "FP"
+
+
+# --------------------------------------------------------------------------- #
+# Label schema discovery
+# --------------------------------------------------------------------------- #
+def test_dataset_evaluation_label_schema():
+    client = NucleusClient(api_key="test")
+    client.connection.make_request = MagicMock(
+        return_value={"gt_labels": ["car"], "prediction_labels": ["vehicle"]}
+    )
+    dataset = Dataset("ds_1", client)
+    out = dataset.evaluation_label_schema()
+    assert out == {"gt_labels": ["car"], "prediction_labels": ["vehicle"]}
+    args = client.connection.make_request.call_args[0]
+    assert args[1] == "dataset/ds_1/labelSchema"
+    assert args[2] is requests.get

From 3a29f12aef3dd629223ede9db06842f1af9927b3 Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Thu, 25 Jun 2026 16:45:09 -0500
Subject: [PATCH 2/5] remove verbose comment

---
 nucleus/__init__.py                 | 2 +-
 nucleus/dataset.py                  | 2 +-
 nucleus/evaluation_v2_exclusions.py | 1 -
 nucleus/evaluation_v2_preset.py     | 1 -
 4 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index a8591c77..8e098786 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -1007,7 +1007,7 @@ def create_evaluations_v2_batch(
     ) -> List[BatchEvaluationResult]:
         """Create many evaluations at once, sharing one configuration.
 
-        Mirrors the batch-create flow in the UI: one evaluation is created for
+        One evaluation is created for
         every ``(model_run_id, slice_id)`` pair (the cross-product of
         ``model_run_ids`` and ``slice_ids``), all sharing the same matches,
         exclusion rules, and options. Jobs run concurrently and failures are
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
index 6f705c81..f7077798 100644
--- a/nucleus/dataset.py
+++ b/nucleus/dataset.py
@@ -233,7 +233,7 @@ def evaluation_label_schema(self) -> Dict[str, List[str]]:
 
         Useful for building :meth:`NucleusClient.create_evaluation_v2`
         ``allowed_label_matches`` and label exclusion rules without guessing
-        label names. Mirrors the label lists shown in the Create Evaluation UI.
+        label names.
 
         Returns:
             A dict with ``"gt_labels"`` (ground-truth annotation labels) and
diff --git a/nucleus/evaluation_v2_exclusions.py b/nucleus/evaluation_v2_exclusions.py
index c529f2e1..6669256e 100644
--- a/nucleus/evaluation_v2_exclusions.py
+++ b/nucleus/evaluation_v2_exclusions.py
@@ -1,6 +1,5 @@
 """Exclusion rules for Evaluation V2 creation.
 
-Mirrors ``packages/shared/src/nucleus/evaluationV2Exclusions.ts`` on the backend.
 These rules drop items/annotations from an evaluation before metrics are computed.
 
 The per-rule shape is validated server-side at create time
diff --git a/nucleus/evaluation_v2_preset.py b/nucleus/evaluation_v2_preset.py
index 36bf3630..09173ab8 100644
--- a/nucleus/evaluation_v2_preset.py
+++ b/nucleus/evaluation_v2_preset.py
@@ -4,7 +4,6 @@
 so the same configuration can be applied across many evaluations. Presets are
 private to the creating user.
 
-Mirrors the ``/v1/nucleus/evaluationV2Presets`` REST endpoints on the backend.
 Create and manage presets via :class:`~nucleus.NucleusClient`::
 
     preset = client.create_evaluation_v2_preset(

From 81b13811a0b7e770facc2eed508a8583f350cd4f Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Thu, 25 Jun 2026 16:55:06 -0500
Subject: [PATCH 3/5] greptile: accept camelCase keys in EvaluationV2Preset.from_json

---
 nucleus/evaluation_v2_preset.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/nucleus/evaluation_v2_preset.py b/nucleus/evaluation_v2_preset.py
index 09173ab8..87258831 100644
--- a/nucleus/evaluation_v2_preset.py
+++ b/nucleus/evaluation_v2_preset.py
@@ -64,8 +64,13 @@ def from_json(
             name=str(payload["name"]),
             allowed_label_matches=parse_allowed_label_matches(
                 payload.get("allowed_label_matches")
+                or payload.get("allowedLabelMatches")
+            ),
+            exclusion_rules=_parse_json_field(
+                payload.get("exclusion_rules")
+                if payload.get("exclusion_rules") is not None
+                else payload.get("exclusionRules")
             ),
-            exclusion_rules=_parse_json_field(payload.get("exclusion_rules")),
             created_by_user_id=payload.get("created_by_user_id"),
             created_at=payload.get("created_at"),
             updated_at=payload.get("updated_at"),

From 3b73b6c69ab7b2dc8ea626aedf977d0363337a63 Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Thu, 25 Jun 2026 17:36:38 -0500
Subject: [PATCH 4/5] remove Evaluations V2 docs section; rename "Sections" to "API Reference"

---
 docs/index.rst | 34 ++--------------------------------
 1 file changed, 2 insertions(+), 32 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index 88ec8c3d..cb310bab 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,36 +12,6 @@ Scale Nucleus helps you:
 
 Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios.
 
-.. _evaluations-v2:
-
-Evaluations V2
---------------
-
-Evaluation V2 measures how well a **model run** matches ground-truth annotations.
-Create a run with :meth:`NucleusClient.create_evaluation_v2`, wait with
-:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`, then read summary metrics with
-:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or individual matches with
-:meth:`nucleus.evaluation_v2.EvaluationV2.examples`.
-
-.. code-block:: python
-
-   import nucleus
-
-   client = nucleus.NucleusClient(api_key="YOUR_API_KEY")
-   evaluation = client.create_evaluation_v2(
-       model_run_id="run_xxx",
-       name="my-eval",
-       allowed_label_matches=[
-           nucleus.AllowedLabelMatch(
-               ground_truth_label="car",
-               model_prediction_label="vehicle",
-           ),
-       ],
-   )
-   evaluation.wait_for_completion()
-   charts = evaluation.charts(iou_threshold=0.5)
-   fps = evaluation.examples(match_type="FP", limit=20)
-
 .. _installation:
 
 Installation
@@ -56,8 +26,8 @@ To use Nucleus, first install it using `pip`:
 
 .. _api:
 
-Sections
---------
+API Reference
+-------------
 
 .. toctree::
    :maxdepth: 4

From 7cc33e79732864f64f0eaad691d893ac5d5a460d Mon Sep 17 00:00:00 2001
From: Luke Schaefer <luke.schaefer@scale.com>
Date: Thu, 25 Jun 2026 17:41:34 -0500
Subject: [PATCH 5/5] greptile: skip preset label matches when allowed_label_matches_id is given

---
 nucleus/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index 8e098786..efbc2261 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -958,7 +958,7 @@ def create_evaluation_v2(
             :class:`EvaluationV2`: The created evaluation.
         """
         if preset is not None:
-            if allowed_label_matches is None:
+            if allowed_label_matches is None and allowed_label_matches_id is None:
                 allowed_label_matches = preset.allowed_label_matches
             if exclusion_rules is None and preset.exclusion_rules is not None:
                 exclusion_rules = list(preset.exclusion_rules)