scaleapi · luke-e-schaefer · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,22 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.18.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.9) - 2026-06-25
+
+### Added
+- **Evaluations V2 slice scoping and exclusion rules.** `create_evaluation_v2()` accepts `slice_id` (restrict the evaluation to a slice's items) and `exclusion_rules` (drop items/annotations before metrics are computed) via the new `MetadataExclusionRule`, `LabelExclusionRule`, and `BoxAreaExclusionRule` types (or equivalent dicts). The `EvaluationV2` resource exposes `slice_id`, `exclusion_rules`, and `exclusion_stats`. `EvaluationV2FilterArgs` gains `gt_area_range` (filter by ground-truth box area, e.g. COCO small/medium/large bands) and `slice_ids`, applied by both `charts()` and `examples()`.
+- **Evaluation V2 presets.** Save and reuse evaluation configurations (`name` + `allowed_label_matches` + `exclusion_rules`) via `NucleusClient.list_evaluation_v2_presets()`, `create_evaluation_v2_preset()`, `update_evaluation_v2_preset()`, and `delete_evaluation_v2_preset()`, plus the new `EvaluationV2Preset` resource (with `update()` / `delete()`). Apply a preset directly when creating an evaluation: `create_evaluation_v2(model_run_id, preset=preset)` seeds the matches and rules (explicit arguments override the preset).
+- `create_evaluation_v2()` accepts `only_items_with_predictions` to restrict the evaluation to items that have at least one prediction.
+- **Batch create.** `create_evaluations_v2_batch()` creates one evaluation per `(model_run_id, slice_id)` pair with a shared configuration, running concurrently and returning a `BatchEvaluationResult` per job (capturing the created evaluation or the per-job error).
+- **Cancel & retry.** `EvaluationV2.cancel()` stops a running evaluation; `EvaluationV2.retry()` re-runs a failed one, reusing its slice/matches/exclusion rules.
+- `Dataset.evaluation_label_schema()` returns the dataset's ground-truth and prediction label vocabularies (`gt_labels` / `prediction_labels`) for building label matches and label exclusion rules.
+
+### Changed
+- `EvaluationV2.examples()` now treats `match_type` as optional — omit it to return examples of all match types.
+
+### Fixed
+- `EvaluationV2.charts()` issues a `POST` (matching the backend route) instead of a `GET` with a query string, which did not reach the server.
+
 ## [0.18.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.8) - 2026-06-17
 
 ### Fixed

diff --git a/docs/index.rst b/docs/index.rst
@@ -12,36 +12,6 @@ Scale Nucleus helps you:
 
 Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios.
 
-.. _evaluations-v2:
-
-Evaluations V2
---------------
-
-Evaluation V2 measures how well a **model run** matches ground-truth annotations.
-Create a run with :meth:`NucleusClient.create_evaluation_v2`, wait with
-:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`, then read summary metrics with
-:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or individual matches with
-:meth:`nucleus.evaluation_v2.EvaluationV2.examples`.
-
-.. code-block:: python
-
-   import nucleus
-
-   client = nucleus.NucleusClient(api_key="YOUR_API_KEY")
-   evaluation = client.create_evaluation_v2(
-       model_run_id="run_xxx",
-       name="my-eval",
-       allowed_label_matches=[
-           nucleus.AllowedLabelMatch(
-               ground_truth_label="car",
-               model_prediction_label="vehicle",
-           ),
-       ],
-   )
-   evaluation.wait_for_completion()
-   charts = evaluation.charts(iou_threshold=0.5)
-   fps = evaluation.examples(match_type="FP", limit=20)
-
 .. _installation:
 
 Installation
@@ -56,8 +26,8 @@ To use Nucleus, first install it using `pip`:
 
 .. _api:
 
-Sections
---------
+API Reference
+-------------
 
 .. toctree::
    :maxdepth: 4

diff --git a/nucleus/__init__.py b/nucleus/__init__.py
@@ -3,6 +3,7 @@
 __all__ = [
     "AsyncJob",
     "AllowedLabelMatch",
+    "BatchEvaluationResult",
     "EmbeddingsExportJob",
     "BoxAnnotation",
     "DeduplicationJob",
@@ -24,7 +25,11 @@
     "EvaluationV2ExamplesPage",
     "EvaluationV2FilterArgs",
     "EvaluationV2MatchExample",
+    "EvaluationV2Preset",
     "EvaluationV2Status",
+    "MetadataExclusionRule",
+    "LabelExclusionRule",
+    "BoxAreaExclusionRule",
     "Frame",
     "Keypoint",
     "KeypointsAnnotation",
@@ -57,6 +62,7 @@
 import datetime
 import os
 import warnings
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import requests
@@ -161,7 +167,19 @@
     NotFoundError,
     NucleusAPIError,
 )
-from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status
+from .evaluation_v2 import (
+    AllowedLabelMatch,
+    BatchEvaluationResult,
+    EvaluationV2,
+    EvaluationV2Status,
+)
+from .evaluation_v2_exclusions import (
+    BoxAreaExclusionRule,
+    EvaluationV2ExclusionRule,
+    LabelExclusionRule,
+    MetadataExclusionRule,
+)
+from .evaluation_v2_preset import _UNSET, EvaluationV2Preset
 from .job import CustomerJobTypes
 from .local_deduplication import (
     LocalDeduplicationResult,
@@ -902,6 +920,12 @@ def create_evaluation_v2(
         name: Optional[str] = None,
         allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
         allowed_label_matches_id: Optional[str] = None,
+        slice_id: Optional[str] = None,
+        exclusion_rules: Optional[
+            List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
+        ] = None,
+        only_items_with_predictions: bool = False,
+        preset: Optional[EvaluationV2Preset] = None,
     ) -> EvaluationV2:
         """Create an evaluation for a model run.
 
@@ -914,10 +938,30 @@ def create_evaluation_v2(
             name: Optional display name.
             allowed_label_matches: Optional label pairs to treat as matches.
             allowed_label_matches_id: Optional id of a saved label-match configuration.
+            slice_id: Optional slice id (``slc_*``) to scope the evaluation to the
+                items in that slice. Must belong to the model run's dataset.
+            exclusion_rules: Optional rules that drop items/annotations before metrics
+                are computed. Each entry is a
+                :class:`~nucleus.evaluation_v2_exclusions.MetadataExclusionRule`,
+                :class:`~nucleus.evaluation_v2_exclusions.LabelExclusionRule`, or
+                :class:`~nucleus.evaluation_v2_exclusions.BoxAreaExclusionRule`
+                (or an equivalent plain dict). Per-rule validation happens server-side;
+                a malformed rule rejects the whole request with a descriptive error.
+            only_items_with_predictions: If ``True``, restrict the evaluation to
+                items that have at least one model prediction.
+            preset: Optional :class:`EvaluationV2Preset` whose
+                ``allowed_label_matches`` and ``exclusion_rules`` seed this
+                evaluation. Explicit ``allowed_label_matches`` / ``exclusion_rules``
+                arguments take precedence over the preset's values.
 
         Returns:
             :class:`EvaluationV2`: The created evaluation.
         """
+        if preset is not None:
+            if allowed_label_matches is None and allowed_label_matches_id is None:
+                allowed_label_matches = preset.allowed_label_matches
+            if exclusion_rules is None and preset.exclusion_rules is not None:
+                exclusion_rules = list(preset.exclusion_rules)
         payload: Dict[str, Any] = {}
         if name is not None:
             payload["name"] = name
@@ -927,6 +971,15 @@ def create_evaluation_v2(
             ]
         if allowed_label_matches_id is not None:
             payload["allowed_label_matches_id"] = allowed_label_matches_id
+        if slice_id is not None:
+            payload["sliceId"] = slice_id
+        if exclusion_rules is not None:
+            payload["exclusionRules"] = [
+                rule.to_api_dict() if hasattr(rule, "to_api_dict") else rule
+                for rule in exclusion_rules
+            ]
+        if only_items_with_predictions:
+            payload["onlyItemsWithPredictions"] = True
         result = self.make_request(
             payload, f"modelRun/{model_run_id}/evaluationsV2"
         )
@@ -937,6 +990,96 @@ def create_evaluation_v2(
             )
         return self.get_evaluation_v2(str(eval_id))
 
+    def create_evaluations_v2_batch(
+        self,
+        model_run_ids: List[str],
+        *,
+        slice_ids: Optional[List[Optional[str]]] = None,
+        name_prefix: Optional[str] = None,
+        allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
+        allowed_label_matches_id: Optional[str] = None,
+        exclusion_rules: Optional[
+            List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
+        ] = None,
+        only_items_with_predictions: bool = False,
+        preset: Optional[EvaluationV2Preset] = None,
+        max_workers: int = 4,
+    ) -> List[BatchEvaluationResult]:
+        """Fan one shared evaluation configuration out over many targets.
+
+        An evaluation is requested for every ``(model_run_id, slice_id)``
+        combination (the cross-product of ``model_run_ids`` and ``slice_ids``)
+        by calling :meth:`create_evaluation_v2` from a thread pool. An
+        exception raised by one job is recorded on that job's result instead
+        of aborting the rest of the batch.
+
+        Parameters:
+            model_run_ids: Model run ids (``run_*``) to evaluate. An empty
+                list returns ``[]`` without issuing any request.
+            slice_ids: Slice ids (``slc_*``) to scope each evaluation to. A
+                ``None`` entry requests a whole-dataset evaluation. Defaults
+                to ``[None]`` (whole dataset for every run).
+            name_prefix: Optional name prefix. The run id (only when more
+                than one run is given) and the slice id (when not ``None``)
+                are appended to it; without a prefix no name is passed.
+            allowed_label_matches: Shared label-match pairs (see
+                :meth:`create_evaluation_v2`).
+            allowed_label_matches_id: Shared saved label-match config id.
+            exclusion_rules: Shared exclusion rules.
+            only_items_with_predictions: Shared "only items with predictions"
+                flag.
+            preset: Optional preset handed to every job.
+            max_workers: Maximum concurrent create requests (default 4).
+
+        Returns:
+            List of :class:`BatchEvaluationResult` ordered like the jobs (all
+            slices of the first run, then the second run, and so on). Each
+            holds the created :class:`EvaluationV2` or the stringified error
+            for that job.
+        """
+        if not model_run_ids:
+            return []
+        scopes: List[Optional[str]] = (
+            [None] if slice_ids is None else list(slice_ids)
+        )
+        # Cross-product in run-major order; results are reported in this order.
+        pairs: List[Tuple[str, Optional[str]]] = []
+        for run_id in model_run_ids:
+            pairs.extend((run_id, scope) for scope in scopes)
+
+        def _job_name(run_id: str, scope: Optional[str]) -> Optional[str]:
+            if name_prefix is None:
+                return None
+            suffixes = [run_id] if len(model_run_ids) > 1 else []
+            if scope is not None:
+                suffixes.append(scope)
+            return " — ".join([name_prefix, *suffixes])
+
+        finished: Dict[int, BatchEvaluationResult] = {}
+        with ThreadPoolExecutor(max_workers=max_workers) as pool:
+            # Maps each future to its job position and the name it was given.
+            pending: Dict[Any, Tuple[int, Optional[str]]] = {}
+            for position, (run_id, scope) in enumerate(pairs):
+                label = _job_name(run_id, scope)
+                future = pool.submit(
+                    self.create_evaluation_v2,
+                    run_id,
+                    name=label,
+                    allowed_label_matches=allowed_label_matches,
+                    allowed_label_matches_id=allowed_label_matches_id,
+                    slice_id=scope,
+                    exclusion_rules=exclusion_rules,
+                    only_items_with_predictions=only_items_with_predictions,
+                    preset=preset,
+                )
+                pending[future] = (position, label)
+            for future in as_completed(pending):
+                position, label = pending[future]
+                run_id, scope = pairs[position]
+                outcome = BatchEvaluationResult(
+                    model_run_id=run_id, slice_id=scope, name=label
+                )
+                try:
+                    outcome.evaluation = future.result()
+                except Exception as exc:  # noqa: BLE001 - reported per job
+                    outcome.error = str(exc)
+                finished[position] = outcome
+        # as_completed yields in completion order; restore the job order.
+        return [finished[position] for position in sorted(finished)]
+
     def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2:
         """Get an evaluation by id.
 
@@ -965,6 +1108,111 @@ def list_evaluations_v2(self, model_run_id: str) -> List[EvaluationV2]:
             )
         return [EvaluationV2.from_json(r, self) for r in rows]
 
+    def list_evaluation_v2_presets(self) -> List[EvaluationV2Preset]:
+        """Fetch every Evaluation V2 preset saved by the current user.
+
+        Returns:
+            List of :class:`EvaluationV2Preset` (presets are private per user).
+
+        Raises:
+            RuntimeError: If the ``evaluationV2Presets`` response is not a list.
+        """
+        listing = self.get("evaluationV2Presets")
+        if isinstance(listing, list):
+            return [EvaluationV2Preset.from_json(row, self) for row in listing]
+        raise RuntimeError(
+            f"Unexpected list evaluation V2 presets response: {listing!r}"
+        )
+
+    def create_evaluation_v2_preset(
+        self,
+        name: str,
+        *,
+        allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
+        exclusion_rules: Optional[
+            List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
+        ] = None,
+    ) -> EvaluationV2Preset:
+        """Save a new Evaluation V2 preset.
+
+        Only ``name`` is always sent; ``allowed_label_matches`` and
+        ``exclusion_rules`` are added to the request only when they are not
+        ``None``.
+
+        Parameters:
+            name: Preset name. Must be non-empty and unique among the user's
+                presets.
+            allowed_label_matches: Optional label pairs to treat as matches.
+            exclusion_rules: Optional rules that drop items/annotations (same
+                types accepted by :meth:`create_evaluation_v2`).
+
+        Returns:
+            :class:`EvaluationV2Preset`: The created preset.
+        """
+        body: Dict[str, Any] = {"name": name}
+        if allowed_label_matches is not None:
+            body["allowedLabelMatches"] = [
+                match.to_api_dict() for match in allowed_label_matches
+            ]
+        if exclusion_rules is not None:
+            serialized_rules = []
+            for entry in exclusion_rules:
+                # Rule objects are serialized; entries without ``to_api_dict``
+                # (e.g. plain dicts) pass through unchanged.
+                if hasattr(entry, "to_api_dict"):
+                    entry = entry.to_api_dict()
+                serialized_rules.append(entry)
+            body["exclusionRules"] = serialized_rules
+        created = self.post(body, "evaluationV2Presets")
+        return EvaluationV2Preset.from_json(created, self)
+
+    def update_evaluation_v2_preset(
+        self,
+        preset_id: str,
+        *,
+        name: Any = _UNSET,
+        allowed_label_matches: Any = _UNSET,
+        exclusion_rules: Any = _UNSET,
+    ) -> EvaluationV2Preset:
+        """Patch selected fields of a saved Evaluation V2 preset.
+
+        Arguments left at their default are omitted from the request, so the
+        corresponding fields stay unchanged. An explicit ``None`` is sent as
+        a null value; passing ``exclusion_rules=None`` clears the rules.
+
+        Parameters:
+            preset_id: Preset id (``prev_*``). Must be owned by the caller.
+            name: Optional new name.
+            allowed_label_matches: Optional new label-match list.
+            exclusion_rules: Optional new exclusion rules, or ``None`` to clear.
+
+        Returns:
+            :class:`EvaluationV2Preset`: The updated preset.
+        """
+        changes: Dict[str, Any] = {}
+        if name is not _UNSET:
+            changes["name"] = name
+        if allowed_label_matches is not _UNSET:
+            matches = allowed_label_matches
+            if matches is not None:
+                matches = [match.to_api_dict() for match in matches]
+            changes["allowedLabelMatches"] = matches
+        if exclusion_rules is not _UNSET:
+            rules = exclusion_rules
+            if rules is not None:
+                # Rule objects are serialized; entries without ``to_api_dict``
+                # (e.g. plain dicts) pass through unchanged.
+                rules = [
+                    entry.to_api_dict()
+                    if hasattr(entry, "to_api_dict")
+                    else entry
+                    for entry in rules
+                ]
+            changes["exclusionRules"] = rules
+        updated = self.patch(changes, f"evaluationV2Presets/{preset_id}")
+        return EvaluationV2Preset.from_json(updated, self)
+
+    def delete_evaluation_v2_preset(self, preset_id: str) -> None:
+        """Remove a saved Evaluation V2 preset.
+
+        Issues a ``DELETE`` request with an empty payload and discards the
+        response.
+
+        Parameters:
+            preset_id: Preset id (``prev_*``). Must be owned by the caller.
+        """
+        preset_route = f"evaluationV2Presets/{preset_id}"
+        # NOTE(review): the raw response is requested and then ignored,
+        # presumably because the endpoint returns no JSON body; confirm.
+        self.make_request(
+            {},
+            preset_route,
+            requests_command=requests.delete,
+            return_raw_response=True,
+        )
+
     @deprecated(msg="Prefer calling Dataset.info() directly.")
     def dataset_info(self, dataset_id: str):
         dataset = self.get_dataset(dataset_id)
@@ -1316,6 +1564,9 @@ def delete(self, route: str):
     def get(self, route: str):
         return self.connection.get(route)
 
+    def patch(self, payload: dict, route: str):
+        """Forward ``payload`` and ``route`` to ``self.connection.patch``.
+
+        Returns:
+            Whatever ``self.connection.patch`` returns, unmodified.
+        """
+        return self.connection.patch(payload, route)
+
     def post(self, payload: dict, route: str):
         return self.connection.post(payload, route)