Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,22 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.18.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.9) - 2026-06-25

### Added
- **Evaluations V2 slice scoping and exclusion rules.** `create_evaluation_v2()` accepts `slice_id` (restrict the evaluation to a slice's items) and `exclusion_rules` (drop items/annotations before metrics are computed) via the new `MetadataExclusionRule`, `LabelExclusionRule`, and `BoxAreaExclusionRule` types (or equivalent dicts). The `EvaluationV2` resource exposes `slice_id`, `exclusion_rules`, and `exclusion_stats`. `EvaluationV2FilterArgs` gains `gt_area_range` (filter by ground-truth box area, e.g. COCO small/medium/large bands) and `slice_ids`, applied by both `charts()` and `examples()`.
- **Evaluation V2 presets.** Save and reuse evaluation configurations (`name` + `allowed_label_matches` + `exclusion_rules`) via `NucleusClient.list_evaluation_v2_presets()`, `create_evaluation_v2_preset()`, `update_evaluation_v2_preset()`, and `delete_evaluation_v2_preset()`, plus the new `EvaluationV2Preset` resource (with `update()` / `delete()`). Apply a preset directly when creating an evaluation: `create_evaluation_v2(model_run_id, preset=preset)` seeds the matches and rules (explicit arguments override the preset).
- `create_evaluation_v2()` accepts `only_items_with_predictions` to restrict the evaluation to items that have at least one prediction.
- **Batch create.** `create_evaluations_v2_batch()` creates one evaluation per `(model_run_id, slice_id)` pair with a shared configuration, running concurrently and returning a `BatchEvaluationResult` per job (capturing the created evaluation or the per-job error).
- **Cancel & retry.** `EvaluationV2.cancel()` stops a running evaluation; `EvaluationV2.retry()` re-runs a failed one, reusing its slice/matches/exclusion rules.
- `Dataset.evaluation_label_schema()` returns the dataset's ground-truth and prediction label vocabularies (`gt_labels` / `prediction_labels`) for building label matches and label exclusion rules.

### Changed
- `EvaluationV2.examples()` now treats `match_type` as optional — omit it to return examples of all match types.

### Fixed
- `EvaluationV2.charts()` issues a `POST` (matching the backend route) instead of a `GET` with a query string, which did not reach the server.

## [0.18.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.8) - 2026-06-17

### Fixed
Expand Down
34 changes: 2 additions & 32 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,36 +12,6 @@ Scale Nucleus helps you:

Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios.

.. _evaluations-v2:

Evaluations V2
--------------

Evaluation V2 measures how well a **model run** matches ground-truth annotations.
Create a run with :meth:`NucleusClient.create_evaluation_v2`, wait with
:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`, then read summary metrics with
:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or individual matches with
:meth:`nucleus.evaluation_v2.EvaluationV2.examples`.

.. code-block:: python

import nucleus

client = nucleus.NucleusClient(api_key="YOUR_API_KEY")
evaluation = client.create_evaluation_v2(
model_run_id="run_xxx",
name="my-eval",
allowed_label_matches=[
nucleus.AllowedLabelMatch(
ground_truth_label="car",
model_prediction_label="vehicle",
),
],
)
evaluation.wait_for_completion()
charts = evaluation.charts(iou_threshold=0.5)
fps = evaluation.examples(match_type="FP", limit=20)

.. _installation:

Installation
Expand All @@ -56,8 +26,8 @@ To use Nucleus, first install it using `pip`:

.. _api:

Sections
--------
API Reference
-------------

.. toctree::
:maxdepth: 4
Expand Down
253 changes: 252 additions & 1 deletion nucleus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
__all__ = [
"AsyncJob",
"AllowedLabelMatch",
"BatchEvaluationResult",
"EmbeddingsExportJob",
"BoxAnnotation",
"DeduplicationJob",
Expand All @@ -24,7 +25,11 @@
"EvaluationV2ExamplesPage",
"EvaluationV2FilterArgs",
"EvaluationV2MatchExample",
"EvaluationV2Preset",
"EvaluationV2Status",
"MetadataExclusionRule",
"LabelExclusionRule",
"BoxAreaExclusionRule",
"Frame",
"Keypoint",
"KeypointsAnnotation",
Expand Down Expand Up @@ -57,6 +62,7 @@
import datetime
import os
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

import requests
Expand Down Expand Up @@ -161,7 +167,19 @@
NotFoundError,
NucleusAPIError,
)
from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status
from .evaluation_v2 import (
AllowedLabelMatch,
BatchEvaluationResult,
EvaluationV2,
EvaluationV2Status,
)
from .evaluation_v2_exclusions import (
BoxAreaExclusionRule,
EvaluationV2ExclusionRule,
LabelExclusionRule,
MetadataExclusionRule,
)
from .evaluation_v2_preset import _UNSET, EvaluationV2Preset
from .job import CustomerJobTypes
from .local_deduplication import (
LocalDeduplicationResult,
Expand Down Expand Up @@ -902,6 +920,12 @@ def create_evaluation_v2(
name: Optional[str] = None,
allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
allowed_label_matches_id: Optional[str] = None,
slice_id: Optional[str] = None,
exclusion_rules: Optional[
List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
] = None,
only_items_with_predictions: bool = False,
preset: Optional[EvaluationV2Preset] = None,
) -> EvaluationV2:
"""Create an evaluation for a model run.

Expand All @@ -914,10 +938,30 @@ def create_evaluation_v2(
name: Optional display name.
allowed_label_matches: Optional label pairs to treat as matches.
allowed_label_matches_id: Optional id of a saved label-match configuration.
slice_id: Optional slice id (``slc_*``) to scope the evaluation to the
items in that slice. Must belong to the model run's dataset.
exclusion_rules: Optional rules that drop items/annotations before metrics
are computed. Each entry is a
:class:`~nucleus.evaluation_v2_exclusions.MetadataExclusionRule`,
:class:`~nucleus.evaluation_v2_exclusions.LabelExclusionRule`, or
:class:`~nucleus.evaluation_v2_exclusions.BoxAreaExclusionRule`
(or an equivalent plain dict). Per-rule validation happens server-side;
a malformed rule rejects the whole request with a descriptive error.
only_items_with_predictions: If ``True``, restrict the evaluation to
items that have at least one model prediction.
preset: Optional :class:`EvaluationV2Preset` whose
``allowed_label_matches`` and ``exclusion_rules`` seed this
evaluation. Explicit ``allowed_label_matches`` / ``exclusion_rules``
arguments take precedence over the preset's values.

Returns:
:class:`EvaluationV2`: The created evaluation.
"""
if preset is not None:
if allowed_label_matches is None and allowed_label_matches_id is None:
allowed_label_matches = preset.allowed_label_matches
if exclusion_rules is None and preset.exclusion_rules is not None:
exclusion_rules = list(preset.exclusion_rules)
Comment thread
luke-e-schaefer marked this conversation as resolved.
payload: Dict[str, Any] = {}
if name is not None:
payload["name"] = name
Expand All @@ -927,6 +971,15 @@ def create_evaluation_v2(
]
if allowed_label_matches_id is not None:
payload["allowed_label_matches_id"] = allowed_label_matches_id
if slice_id is not None:
payload["sliceId"] = slice_id
if exclusion_rules is not None:
payload["exclusionRules"] = [
rule.to_api_dict() if hasattr(rule, "to_api_dict") else rule
for rule in exclusion_rules
]
if only_items_with_predictions:
payload["onlyItemsWithPredictions"] = True
result = self.make_request(
payload, f"modelRun/{model_run_id}/evaluationsV2"
)
Expand All @@ -937,6 +990,96 @@ def create_evaluation_v2(
)
return self.get_evaluation_v2(str(eval_id))

def create_evaluations_v2_batch(
    self,
    model_run_ids: List[str],
    *,
    slice_ids: Optional[List[Optional[str]]] = None,
    name_prefix: Optional[str] = None,
    allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
    allowed_label_matches_id: Optional[str] = None,
    exclusion_rules: Optional[
        List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
    ] = None,
    only_items_with_predictions: bool = False,
    preset: Optional[EvaluationV2Preset] = None,
    max_workers: int = 4,
) -> List[BatchEvaluationResult]:
    """Create one evaluation per ``(model_run_id, slice_id)`` pair.

    The jobs are the cross-product of ``model_run_ids`` and ``slice_ids``.
    Every job is submitted to a thread pool and forwards the same matches,
    exclusion rules, options, and preset to :meth:`create_evaluation_v2`.
    An exception raised by one job is recorded on that job's result instead
    of aborting the rest of the batch.

    Parameters:
        model_run_ids: Model run ids (``run_*``) to evaluate. An empty list
            returns ``[]`` without creating anything.
        slice_ids: Slice ids (``slc_*``) to scope each evaluation to. A
            ``None`` entry requests a whole-dataset evaluation. Defaults to
            ``[None]`` (whole dataset for every run).
        name_prefix: Optional name prefix. The run id is appended when more
            than one run id was given, and the slice id is appended when it
            is not ``None``. When omitted, jobs are created without a name.
        allowed_label_matches: Shared label-match pairs (see
            :meth:`create_evaluation_v2`).
        allowed_label_matches_id: Shared saved label-match config id.
        exclusion_rules: Shared exclusion rules.
        only_items_with_predictions: Shared "only items with predictions" flag.
        preset: Optional preset seeding matches/rules for every job.
        max_workers: Maximum concurrent create requests (default 4).

    Returns:
        List of :class:`BatchEvaluationResult`, in input order — each holds
        the created :class:`EvaluationV2` or the error for that job.
    """
    if not model_run_ids:
        return []

    scopes: List[Optional[str]] = (
        [None] if slice_ids is None else list(slice_ids)
    )

    # Run-major ordering: every slice of the first run, then the second, ...
    pairs: List[Tuple[str, Optional[str]]] = []
    for run_id in model_run_ids:
        for scope in scopes:
            pairs.append((run_id, scope))

    # The run id only disambiguates names when several runs are in the batch.
    include_run_id = len(model_run_ids) > 1

    def _label(run_id: str, scope: Optional[str]) -> Optional[str]:
        if name_prefix is None:
            return None
        pieces = [name_prefix]
        if include_run_id:
            pieces.append(run_id)
        if scope is not None:
            pieces.append(scope)
        return " — ".join(pieces)

    # Each name is needed twice (request + result), so compute it once.
    labels = [_label(run_id, scope) for run_id, scope in pairs]
    outcomes: List[Optional[BatchEvaluationResult]] = [None] * len(pairs)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to its slot so completion order is irrelevant.
        pending = {}
        for position, (run_id, scope) in enumerate(pairs):
            submitted = pool.submit(
                self.create_evaluation_v2,
                run_id,
                name=labels[position],
                allowed_label_matches=allowed_label_matches,
                allowed_label_matches_id=allowed_label_matches_id,
                slice_id=scope,
                exclusion_rules=exclusion_rules,
                only_items_with_predictions=only_items_with_predictions,
                preset=preset,
            )
            pending[submitted] = position

        for finished in as_completed(pending):
            position = pending[finished]
            run_id, scope = pairs[position]
            outcome = BatchEvaluationResult(
                model_run_id=run_id, slice_id=scope, name=labels[position]
            )
            try:
                outcome.evaluation = finished.result()
            except Exception as exc:  # noqa: BLE001 - reported per job
                outcome.error = str(exc)
            outcomes[position] = outcome

    return [outcome for outcome in outcomes if outcome is not None]

def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2:
"""Get an evaluation by id.

Expand Down Expand Up @@ -965,6 +1108,111 @@ def list_evaluations_v2(self, model_run_id: str) -> List[EvaluationV2]:
)
return [EvaluationV2.from_json(r, self) for r in rows]

def list_evaluation_v2_presets(self) -> List[EvaluationV2Preset]:
    """Fetch the current user's saved Evaluation V2 presets.

    Returns:
        List of :class:`EvaluationV2Preset` (presets are private per user).

    Raises:
        RuntimeError: If the ``evaluationV2Presets`` response is not a list.
    """
    response = self.get("evaluationV2Presets")
    if isinstance(response, list):
        return [
            EvaluationV2Preset.from_json(entry, self) for entry in response
        ]
    raise RuntimeError(
        f"Unexpected list evaluation V2 presets response: {response!r}"
    )

def create_evaluation_v2_preset(
    self,
    name: str,
    *,
    allowed_label_matches: Optional[List[AllowedLabelMatch]] = None,
    exclusion_rules: Optional[
        List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]]
    ] = None,
) -> EvaluationV2Preset:
    """Save a new Evaluation V2 preset.

    Parameters:
        name: Preset name. Must be non-empty and unique among the user's
            presets.
        allowed_label_matches: Optional label pairs to treat as matches.
            Omitted from the request when ``None``.
        exclusion_rules: Optional rules that drop items/annotations (same
            types accepted by :meth:`create_evaluation_v2`). Omitted from
            the request when ``None``.

    Returns:
        :class:`EvaluationV2Preset`: The created preset.
    """
    body: Dict[str, Any] = {"name": name}

    if allowed_label_matches is not None:
        body["allowedLabelMatches"] = [
            match.to_api_dict() for match in allowed_label_matches
        ]

    if exclusion_rules is not None:
        # Rule objects serialize themselves; anything without
        # ``to_api_dict`` (e.g. a plain dict) is sent unchanged.
        serialized_rules = []
        for rule in exclusion_rules:
            if hasattr(rule, "to_api_dict"):
                serialized_rules.append(rule.to_api_dict())
            else:
                serialized_rules.append(rule)
        body["exclusionRules"] = serialized_rules

    created = self.post(body, "evaluationV2Presets")
    return EvaluationV2Preset.from_json(created, self)

def update_evaluation_v2_preset(
    self,
    preset_id: str,
    *,
    name: Any = _UNSET,
    allowed_label_matches: Any = _UNSET,
    exclusion_rules: Any = _UNSET,
) -> EvaluationV2Preset:
    """Patch a saved Evaluation V2 preset.

    Only the arguments you pass end up in the request; an omitted argument
    leaves that field unchanged. Passing ``exclusion_rules=None`` clears the
    rules. ``allowed_label_matches=None`` is likewise forwarded as an
    explicit null.

    Parameters:
        preset_id: Preset id (``prev_*``). Must be owned by the caller.
        name: Optional new name.
        allowed_label_matches: Optional new label-match list.
        exclusion_rules: Optional new exclusion rules, or ``None`` to clear.

    Returns:
        :class:`EvaluationV2Preset`: The updated preset.
    """
    changes: Dict[str, Any] = {}

    if name is not _UNSET:
        changes["name"] = name

    if allowed_label_matches is not _UNSET:
        if allowed_label_matches is None:
            changes["allowedLabelMatches"] = None
        else:
            changes["allowedLabelMatches"] = [
                match.to_api_dict() for match in allowed_label_matches
            ]

    if exclusion_rules is not _UNSET:
        if exclusion_rules is None:
            changes["exclusionRules"] = None
        else:
            # Plain dicts (no ``to_api_dict``) are passed through untouched.
            changes["exclusionRules"] = [
                rule.to_api_dict() if hasattr(rule, "to_api_dict") else rule
                for rule in exclusion_rules
            ]

    updated = self.patch(changes, f"evaluationV2Presets/{preset_id}")
    return EvaluationV2Preset.from_json(updated, self)

def delete_evaluation_v2_preset(self, preset_id: str) -> None:
    """Remove a saved Evaluation V2 preset.

    Sends a DELETE with an empty payload and discards the raw response.

    Parameters:
        preset_id: Preset id (``prev_*``). Must be owned by the caller.
    """
    route = f"evaluationV2Presets/{preset_id}"
    self.make_request(
        {},
        route,
        requests_command=requests.delete,
        return_raw_response=True,
    )

@deprecated(msg="Prefer calling Dataset.info() directly.")
def dataset_info(self, dataset_id: str):
dataset = self.get_dataset(dataset_id)
Expand Down Expand Up @@ -1316,6 +1564,9 @@ def delete(self, route: str):
def get(self, route: str):
    """Delegate a GET for ``route`` to the client's connection.

    Returns whatever ``self.connection.get`` returns.
    """
    response = self.connection.get(route)
    return response

def patch(self, payload: dict, route: str):
    """Delegate a PATCH of ``payload`` to ``route`` to the client's connection.

    Returns whatever ``self.connection.patch`` returns.
    """
    response = self.connection.patch(payload, route)
    return response

def post(self, payload: dict, route: str):
    """Delegate a POST of ``payload`` to ``route`` to the client's connection.

    Returns whatever ``self.connection.post`` returns.
    """
    response = self.connection.post(payload, route)
    return response

Expand Down
Loading