From 02e6d874e0bfdaae48456b2ffa07a04ab6227dd7 Mon Sep 17 00:00:00 2001 From: Luke Schaefer Date: Thu, 25 Jun 2026 16:34:17 -0500 Subject: [PATCH 1/5] update sdk with new adds --- CHANGELOG.md | 16 + nucleus/__init__.py | 253 +++++++++++++- nucleus/connection.py | 5 + nucleus/data_transfer_object/evaluation_v2.py | 4 + nucleus/dataset.py | 15 + nucleus/evaluation_v2.py | 167 +++++++-- nucleus/evaluation_v2_exclusions.py | 112 ++++++ nucleus/evaluation_v2_preset.py | 111 ++++++ pyproject.toml | 2 +- tests/test_evaluation_v2.py | 196 +++++++++-- tests/test_evaluation_v2_presets.py | 327 ++++++++++++++++++ 11 files changed, 1146 insertions(+), 62 deletions(-) create mode 100644 nucleus/evaluation_v2_exclusions.py create mode 100644 nucleus/evaluation_v2_preset.py create mode 100644 tests/test_evaluation_v2_presets.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4212614c..56345247 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.18.9](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.9) - 2026-06-25 + +### Added +- **Evaluations V2 slice scoping and exclusion rules.** `create_evaluation_v2()` accepts `slice_id` (restrict the evaluation to a slice's items) and `exclusion_rules` (drop items/annotations before metrics are computed) via the new `MetadataExclusionRule`, `LabelExclusionRule`, and `BoxAreaExclusionRule` types (or equivalent dicts). The `EvaluationV2` resource exposes `slice_id`, `exclusion_rules`, and `exclusion_stats`. `EvaluationV2FilterArgs` gains `gt_area_range` (filter by ground-truth box area, e.g. COCO small/medium/large bands) and `slice_ids`, applied by both `charts()` and `examples()`. 
+- **Evaluation V2 presets.** Save and reuse evaluation configurations (`name` + `allowed_label_matches` + `exclusion_rules`) via `NucleusClient.list_evaluation_v2_presets()`, `create_evaluation_v2_preset()`, `update_evaluation_v2_preset()`, and `delete_evaluation_v2_preset()`, plus the new `EvaluationV2Preset` resource (with `update()` / `delete()`). Apply a preset directly when creating an evaluation: `create_evaluation_v2(model_run_id, preset=preset)` seeds the matches and rules (explicit `allowed_label_matches` / `exclusion_rules` arguments take precedence over the preset's values). +- `create_evaluation_v2()` accepts `only_items_with_predictions` to restrict the evaluation to items that have at least one prediction. +- **Batch create.** `create_evaluations_v2_batch()` creates one evaluation per `(model_run_id, slice_id)` pair (the cross-product of `model_run_ids` and `slice_ids`) with a shared configuration. Requests run concurrently (`max_workers`, default 4) and the call returns one `BatchEvaluationResult` per job, in input order (capturing the created evaluation or the per-job error). +- **Cancel & retry.** `EvaluationV2.cancel()` stops a running evaluation; `EvaluationV2.retry()` creates and returns a new evaluation for a failed one, reusing its slice/matches/exclusion rules. +- `Dataset.evaluation_label_schema()` returns the dataset's ground-truth and prediction label vocabularies (`gt_labels` / `prediction_labels`) for building label matches and label exclusion rules. + +### Changed +- `EvaluationV2.examples()` now treats `match_type` as optional — omit it to return examples of all match types. + +### Fixed +- `EvaluationV2.charts()` now issues a `POST` (matching the backend route); the previous `GET` with a query string did not reach the server. 
+ ## [0.18.8](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.18.8) - 2026-06-17 ### Fixed diff --git a/nucleus/__init__.py b/nucleus/__init__.py index 12e3c540..a8591c77 100644 --- a/nucleus/__init__.py +++ b/nucleus/__init__.py @@ -3,6 +3,7 @@ __all__ = [ "AsyncJob", "AllowedLabelMatch", + "BatchEvaluationResult", "EmbeddingsExportJob", "BoxAnnotation", "DeduplicationJob", @@ -24,7 +25,11 @@ "EvaluationV2ExamplesPage", "EvaluationV2FilterArgs", "EvaluationV2MatchExample", + "EvaluationV2Preset", "EvaluationV2Status", + "MetadataExclusionRule", + "LabelExclusionRule", + "BoxAreaExclusionRule", "Frame", "Keypoint", "KeypointsAnnotation", @@ -57,6 +62,7 @@ import datetime import os import warnings +from concurrent.futures import ThreadPoolExecutor, as_completed from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import requests @@ -161,7 +167,19 @@ NotFoundError, NucleusAPIError, ) -from .evaluation_v2 import AllowedLabelMatch, EvaluationV2, EvaluationV2Status +from .evaluation_v2 import ( + AllowedLabelMatch, + BatchEvaluationResult, + EvaluationV2, + EvaluationV2Status, +) +from .evaluation_v2_exclusions import ( + BoxAreaExclusionRule, + EvaluationV2ExclusionRule, + LabelExclusionRule, + MetadataExclusionRule, +) +from .evaluation_v2_preset import _UNSET, EvaluationV2Preset from .job import CustomerJobTypes from .local_deduplication import ( LocalDeduplicationResult, @@ -902,6 +920,12 @@ def create_evaluation_v2( name: Optional[str] = None, allowed_label_matches: Optional[List[AllowedLabelMatch]] = None, allowed_label_matches_id: Optional[str] = None, + slice_id: Optional[str] = None, + exclusion_rules: Optional[ + List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]] + ] = None, + only_items_with_predictions: bool = False, + preset: Optional[EvaluationV2Preset] = None, ) -> EvaluationV2: """Create an evaluation for a model run. @@ -914,10 +938,30 @@ def create_evaluation_v2( name: Optional display name. 
allowed_label_matches: Optional label pairs to treat as matches. allowed_label_matches_id: Optional id of a saved label-match configuration. + slice_id: Optional slice id (``slc_*``) to scope the evaluation to the + items in that slice. Must belong to the model run's dataset. + exclusion_rules: Optional rules that drop items/annotations before metrics + are computed. Each entry is a + :class:`~nucleus.evaluation_v2_exclusions.MetadataExclusionRule`, + :class:`~nucleus.evaluation_v2_exclusions.LabelExclusionRule`, or + :class:`~nucleus.evaluation_v2_exclusions.BoxAreaExclusionRule` + (or an equivalent plain dict). Per-rule validation happens server-side; + a malformed rule rejects the whole request with a descriptive error. + only_items_with_predictions: If ``True``, restrict the evaluation to + items that have at least one model prediction. + preset: Optional :class:`EvaluationV2Preset` whose + ``allowed_label_matches`` and ``exclusion_rules`` seed this + evaluation. Explicit ``allowed_label_matches`` / ``exclusion_rules`` + arguments take precedence over the preset's values. Returns: :class:`EvaluationV2`: The created evaluation. 
""" + if preset is not None: + if allowed_label_matches is None: + allowed_label_matches = preset.allowed_label_matches + if exclusion_rules is None and preset.exclusion_rules is not None: + exclusion_rules = list(preset.exclusion_rules) payload: Dict[str, Any] = {} if name is not None: payload["name"] = name @@ -927,6 +971,15 @@ def create_evaluation_v2( ] if allowed_label_matches_id is not None: payload["allowed_label_matches_id"] = allowed_label_matches_id + if slice_id is not None: + payload["sliceId"] = slice_id + if exclusion_rules is not None: + payload["exclusionRules"] = [ + rule.to_api_dict() if hasattr(rule, "to_api_dict") else rule + for rule in exclusion_rules + ] + if only_items_with_predictions: + payload["onlyItemsWithPredictions"] = True result = self.make_request( payload, f"modelRun/{model_run_id}/evaluationsV2" ) @@ -937,6 +990,96 @@ def create_evaluation_v2( ) return self.get_evaluation_v2(str(eval_id)) + def create_evaluations_v2_batch( + self, + model_run_ids: List[str], + *, + slice_ids: Optional[List[Optional[str]]] = None, + name_prefix: Optional[str] = None, + allowed_label_matches: Optional[List[AllowedLabelMatch]] = None, + allowed_label_matches_id: Optional[str] = None, + exclusion_rules: Optional[ + List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]] + ] = None, + only_items_with_predictions: bool = False, + preset: Optional[EvaluationV2Preset] = None, + max_workers: int = 4, + ) -> List[BatchEvaluationResult]: + """Create many evaluations at once, sharing one configuration. + + Mirrors the batch-create flow in the UI: one evaluation is created for + every ``(model_run_id, slice_id)`` pair (the cross-product of + ``model_run_ids`` and ``slice_ids``), all sharing the same matches, + exclusion rules, and options. Jobs run concurrently and failures are + captured per job rather than aborting the batch. + + Parameters: + model_run_ids: Model run ids (``run_*``) to evaluate. 
+ slice_ids: Slice ids (``slc_*``) to scope each evaluation to. Use + ``None`` within the list for a whole-dataset evaluation. Defaults + to ``[None]`` (whole dataset for every run). + name_prefix: Optional name prefix; the run id and/or slice id are + appended to keep batch names unique. + allowed_label_matches: Shared label-match pairs (see + :meth:`create_evaluation_v2`). + allowed_label_matches_id: Shared saved label-match config id. + exclusion_rules: Shared exclusion rules. + only_items_with_predictions: Shared "only items with predictions" flag. + preset: Optional preset seeding matches/rules for every job. + max_workers: Maximum concurrent create requests (default 4). + + Returns: + List of :class:`BatchEvaluationResult`, in input order — each holds + the created :class:`EvaluationV2` or the error for that job. + """ + if not model_run_ids: + return [] + targets: List[Optional[str]] = ( + list(slice_ids) if slice_ids is not None else [None] + ) + jobs: List[Tuple[str, Optional[str]]] = [ + (run, sl) for run in model_run_ids for sl in targets + ] + + def _name(run: str, sl: Optional[str]) -> Optional[str]: + if name_prefix is None: + return None + parts = [name_prefix] + if len(model_run_ids) > 1: + parts.append(run) + if sl is not None: + parts.append(sl) + return " — ".join(parts) + + results: List[Optional[BatchEvaluationResult]] = [None] * len(jobs) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_idx = { + executor.submit( + self.create_evaluation_v2, + run, + name=_name(run, sl), + allowed_label_matches=allowed_label_matches, + allowed_label_matches_id=allowed_label_matches_id, + slice_id=sl, + exclusion_rules=exclusion_rules, + only_items_with_predictions=only_items_with_predictions, + preset=preset, + ): idx + for idx, (run, sl) in enumerate(jobs) + } + for future in as_completed(future_to_idx): + idx = future_to_idx[future] + run, sl = jobs[idx] + result = BatchEvaluationResult( + model_run_id=run, slice_id=sl, 
name=_name(run, sl) + ) + try: + result.evaluation = future.result() + except Exception as exc: # noqa: BLE001 - reported per job + result.error = str(exc) + results[idx] = result + return [r for r in results if r is not None] + def get_evaluation_v2(self, evaluation_id: str) -> EvaluationV2: """Get an evaluation by id. @@ -965,6 +1108,111 @@ def list_evaluations_v2(self, model_run_id: str) -> List[EvaluationV2]: ) return [EvaluationV2.from_json(r, self) for r in rows] + def list_evaluation_v2_presets(self) -> List[EvaluationV2Preset]: + """List the current user's saved Evaluation V2 presets. + + Returns: + List of :class:`EvaluationV2Preset` (presets are private per user). + """ + rows = self.get("evaluationV2Presets") + if not isinstance(rows, list): + raise RuntimeError( + f"Unexpected list evaluation V2 presets response: {rows!r}" + ) + return [EvaluationV2Preset.from_json(r, self) for r in rows] + + def create_evaluation_v2_preset( + self, + name: str, + *, + allowed_label_matches: Optional[List[AllowedLabelMatch]] = None, + exclusion_rules: Optional[ + List[Union[EvaluationV2ExclusionRule, Dict[str, Any]]] + ] = None, + ) -> EvaluationV2Preset: + """Create a saved Evaluation V2 preset. + + Parameters: + name: Preset name. Must be non-empty and unique among the user's + presets. + allowed_label_matches: Optional label pairs to treat as matches. + exclusion_rules: Optional rules that drop items/annotations (same + types accepted by :meth:`create_evaluation_v2`). + + Returns: + :class:`EvaluationV2Preset`: The created preset. 
+ """ + payload: Dict[str, Any] = {"name": name} + if allowed_label_matches is not None: + payload["allowedLabelMatches"] = [ + m.to_api_dict() for m in allowed_label_matches + ] + if exclusion_rules is not None: + payload["exclusionRules"] = [ + rule.to_api_dict() if hasattr(rule, "to_api_dict") else rule + for rule in exclusion_rules + ] + data = self.post(payload, "evaluationV2Presets") + return EvaluationV2Preset.from_json(data, self) + + def update_evaluation_v2_preset( + self, + preset_id: str, + *, + name: Any = _UNSET, + allowed_label_matches: Any = _UNSET, + exclusion_rules: Any = _UNSET, + ) -> EvaluationV2Preset: + """Update a saved Evaluation V2 preset. + + Only the fields you pass are changed. Passing ``exclusion_rules=None`` + clears the rules; omitting an argument leaves that field unchanged. + + Parameters: + preset_id: Preset id (``prev_*``). Must be owned by the caller. + name: Optional new name. + allowed_label_matches: Optional new label-match list. + exclusion_rules: Optional new exclusion rules, or ``None`` to clear. + + Returns: + :class:`EvaluationV2Preset`: The updated preset. + """ + payload: Dict[str, Any] = {} + if name is not _UNSET: + payload["name"] = name + if allowed_label_matches is not _UNSET: + payload["allowedLabelMatches"] = ( + None + if allowed_label_matches is None + else [m.to_api_dict() for m in allowed_label_matches] + ) + if exclusion_rules is not _UNSET: + payload["exclusionRules"] = ( + None + if exclusion_rules is None + else [ + rule.to_api_dict() + if hasattr(rule, "to_api_dict") + else rule + for rule in exclusion_rules + ] + ) + data = self.patch(payload, f"evaluationV2Presets/{preset_id}") + return EvaluationV2Preset.from_json(data, self) + + def delete_evaluation_v2_preset(self, preset_id: str) -> None: + """Delete a saved Evaluation V2 preset. + + Parameters: + preset_id: Preset id (``prev_*``). Must be owned by the caller. 
+ """ + self.make_request( + {}, + f"evaluationV2Presets/{preset_id}", + requests_command=requests.delete, + return_raw_response=True, + ) + @deprecated(msg="Prefer calling Dataset.info() directly.") def dataset_info(self, dataset_id: str): dataset = self.get_dataset(dataset_id) @@ -1316,6 +1564,9 @@ def delete(self, route: str): def get(self, route: str): return self.connection.get(route) + def patch(self, payload: dict, route: str): + return self.connection.patch(payload, route) + def post(self, payload: dict, route: str): return self.connection.post(payload, route) diff --git a/nucleus/connection.py b/nucleus/connection.py index 467600a9..050f12ff 100644 --- a/nucleus/connection.py +++ b/nucleus/connection.py @@ -50,6 +50,11 @@ def delete(self, route: str): def get(self, route: str): return self.make_request({}, route, requests_command=requests.get) + def patch(self, payload: dict, route: str): + return self.make_request( + payload, route, requests_command=requests.patch + ) + def post(self, payload: dict, route: str): return self.make_request( payload, route, requests_command=requests.post diff --git a/nucleus/data_transfer_object/evaluation_v2.py b/nucleus/data_transfer_object/evaluation_v2.py index a1abb443..b2f11da7 100644 --- a/nucleus/data_transfer_object/evaluation_v2.py +++ b/nucleus/data_transfer_object/evaluation_v2.py @@ -43,9 +43,11 @@ class MetadataPredicate(DictCompatibleModel): "gt_labels": "gtLabels", "item_metadata": "itemMetadata", "prediction_metadata": "predictionMetadata", + "gt_area_range": "gtAreaRange", "label_equality": "labelEquality", "has_ground_truth": "hasGroundTruth", "tide_background": "tideBackground", + "slice_ids": "sliceIds", } @@ -58,9 +60,11 @@ class EvaluationV2FilterArgs(DictCompatibleModel): gt_labels: Optional[List[str]] = None item_metadata: Optional[List[MetadataPredicate]] = None prediction_metadata: Optional[List[MetadataPredicate]] = None + gt_area_range: Optional[RangeNum] = None label_equality: 
Optional[Literal["EQ", "NEQ"]] = None has_ground_truth: Optional[bool] = None tide_background: Optional[bool] = None + slice_ids: Optional[List[str]] = None def to_api_filters(self) -> Dict[str, Any]: """Return filters as a dict ready for API requests.""" diff --git a/nucleus/dataset.py b/nucleus/dataset.py index f5efee54..6f705c81 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -228,6 +228,21 @@ def slices(self) -> List[Slice]: ) return [Slice.from_request(info, self._client) for info in response] + def evaluation_label_schema(self) -> Dict[str, List[str]]: + """Ground-truth and prediction label vocabularies for this dataset. + + Useful for building :meth:`NucleusClient.create_evaluation_v2` + ``allowed_label_matches`` and label exclusion rules without guessing + label names. Mirrors the label lists shown in the Create Evaluation UI. + + Returns: + A dict with ``"gt_labels"`` (ground-truth annotation labels) and + ``"prediction_labels"`` (model prediction labels). + """ + return self._client.make_request( + {}, f"dataset/{self.id}/labelSchema", requests.get + ) + @property def embedding_indexes(self) -> List[EmbeddingIndex]: """Gets all the embedding indexes belonging to this Dataset.""" diff --git a/nucleus/evaluation_v2.py b/nucleus/evaluation_v2.py index 43f8a03c..aa90281b 100644 --- a/nucleus/evaluation_v2.py +++ b/nucleus/evaluation_v2.py @@ -7,7 +7,6 @@ from dataclasses import dataclass, field from enum import Enum from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union -from urllib.parse import urlencode import requests @@ -37,6 +36,23 @@ class EvaluationV2Status(str, Enum): } +def _parse_json_field(value: Any) -> Optional[Any]: + """Normalize a JSONB column that may arrive as a string or already parsed. 
+ + The REST ``GET``/``LIST`` evaluation endpoints return raw DB rows, so the + ``exclusion_rules`` / ``exclusion_stats`` jsonb columns can come back either + already decoded (dict/list) or as a JSON string depending on the driver. + """ + if value is None or isinstance(value, (dict, list)): + return value + if isinstance(value, str): + try: + return json.loads(value) + except (ValueError, TypeError): + return None + return value + + @dataclass class AllowedLabelMatch: """Ground-truth and prediction label pair that counts as a match.""" @@ -51,6 +67,56 @@ def to_api_dict(self) -> Dict[str, str]: } +def parse_allowed_label_matches( + raw_matches: Any, +) -> Optional[List[AllowedLabelMatch]]: + """Parse an ``allowed_label_matches`` array from an API payload. + + Accepts both the camelCase (``groundTruthLabel`` / ``modelPredictionLabel``) + and snake_case shapes the backend may return, and drops malformed entries. + """ + if not isinstance(raw_matches, list): + return None + matches: List[AllowedLabelMatch] = [] + for m in raw_matches: + if not isinstance(m, dict): + continue + gt = m.get("groundTruthLabel") + if gt is None: + gt = m.get("ground_truth_label") + mp = m.get("modelPredictionLabel") + if mp is None: + mp = m.get("model_prediction_label") + if gt is not None and mp is not None: + matches.append( + AllowedLabelMatch( + ground_truth_label=str(gt), + model_prediction_label=str(mp), + ) + ) + return matches + + +@dataclass +class BatchEvaluationResult: + """Outcome of one job in a batch create call. + + ``evaluation`` is set on success; ``error`` holds the error message on + failure. Use :attr:`succeeded` to filter, and re-run the failed jobs by + feeding their ``model_run_id`` / ``slice_id`` back into a new batch call. 
+ """ + + model_run_id: str + slice_id: Optional[str] = None + name: Optional[str] = None + evaluation: Optional["EvaluationV2"] = None + error: Optional[str] = None + + @property + def succeeded(self) -> bool: + return self.evaluation is not None + + @dataclass class EvaluationV2: """An Evaluation V2 run for a model run.""" @@ -66,6 +132,9 @@ class EvaluationV2: allowed_label_matches_id: Optional[str] = None allowed_label_matches: Optional[List[AllowedLabelMatch]] = None allowed_label_matches_name: Optional[str] = None + slice_id: Optional[str] = None + exclusion_rules: Optional[List[Dict[str, Any]]] = None + exclusion_stats: Optional[Dict[str, Any]] = None _client: Optional["NucleusClient"] = field(repr=False, default=None) @classmethod @@ -74,26 +143,9 @@ def from_json( payload: Dict[str, Any], client: Optional["NucleusClient"] = None, ) -> "EvaluationV2": - raw_matches = payload.get("allowed_label_matches") - matches: Optional[List[AllowedLabelMatch]] = None - if isinstance(raw_matches, list): - matches = [] - for m in raw_matches: - if not isinstance(m, dict): - continue - gt = m.get("groundTruthLabel") - if gt is None: - gt = m.get("ground_truth_label") - mp = m.get("modelPredictionLabel") - if mp is None: - mp = m.get("model_prediction_label") - if gt is not None and mp is not None: - matches.append( - AllowedLabelMatch( - ground_truth_label=str(gt), - model_prediction_label=str(mp), - ) - ) + matches = parse_allowed_label_matches( + payload.get("allowed_label_matches") + ) return cls( id=str(payload["id"]), @@ -109,6 +161,9 @@ def from_json( allowed_label_matches_name=payload.get( "allowed_label_matches_name" ), + slice_id=payload.get("slice_id"), + exclusion_rules=_parse_json_field(payload.get("exclusion_rules")), + exclusion_stats=_parse_json_field(payload.get("exclusion_stats")), _client=client, ) @@ -170,6 +225,45 @@ def delete(self) -> None: return_raw_response=True, ) + def cancel(self) -> "EvaluationV2": + """Cancel this evaluation if it is still 
running. + + Stops the evaluation and sets its status to ``cancelled``. Finished + evaluations cannot be cancelled (use :meth:`delete` to archive them). + + Returns: + self, refreshed with the post-cancel status. + """ + if self._client is None: + raise RuntimeError("EvaluationV2 has no client.") + self._client.make_request( + {}, + f"evaluationsV2/{self.id}/cancel", + requests_command=requests.post, + return_raw_response=True, + ) + return self.refresh() + + def retry(self) -> "EvaluationV2": + """Retry this evaluation if it failed. + + Creates a new evaluation for the same model run, reusing this + evaluation's slice, allowed-label-matches, and exclusion rules. Only + ``failed`` evaluations can be retried. + + Returns: + :class:`EvaluationV2`: The newly created (retry) evaluation. + """ + if self._client is None: + raise RuntimeError("EvaluationV2 has no client.") + result = self._client.post({}, f"evaluationsV2/{self.id}/retry") + eval_id = result.get("evaluation_id") + if not eval_id: + raise RuntimeError( + f"Unexpected retry evaluation V2 response: {result}" + ) + return self._client.get_evaluation_v2(str(eval_id)) + def charts( self, iou_threshold: float = 0.5, @@ -190,24 +284,20 @@ def charts( """ if self._client is None: raise RuntimeError("EvaluationV2 has no client.") - params: Dict[str, str] = {} - params["iouThreshold"] = str(iou_threshold) + payload: Dict[str, Any] = {"iouThreshold": iou_threshold} if filters is not None: if isinstance(filters, EvaluationV2FilterArgs): - filt_dict = filters.to_api_filters() + payload["filters"] = filters.to_api_filters() else: - filt_dict = filters - params["filters"] = json.dumps(filt_dict) + payload["filters"] = filters if query: - params["query"] = query - qs = urlencode(params) - route = f"evaluationsV2/{self.id}/charts?{qs}" - data = self._client.get(route) + payload["query"] = query + data = self._client.post(payload, f"evaluationsV2/{self.id}/charts") return EvaluationV2Charts.parse_obj(data) def examples( 
self, - match_type: str, + match_type: Optional[str] = None, limit: int = 50, offset: int = 0, sort_by: Optional[str] = None, @@ -217,14 +307,16 @@ def examples( ] = None, query: Optional[str] = None, ) -> EvaluationV2ExamplesPage: - """Return paginated true-positive, false-positive, or false-negative examples. + """Return paginated match examples, optionally filtered by match type. Parameters: - match_type: ``"TP"``, ``"FP"``, or ``"FN"``. - limit: Page size (default 50). + match_type: ``"TP"``, ``"FP"``, or ``"FN"``. Omit (or ``None``) to + return examples of all match types. + limit: Page size (default 50, max 100). offset: Row offset for pagination. - sort_by: Optional field to sort by. - sort_order: Optional sort direction (e.g. ``"asc"`` or ``"desc"``). + sort_by: Optional field to sort by — one of ``"confidence"``, + ``"iou"``, ``"dataset_item_id"``, ``"gt_area"``. + sort_order: Optional sort direction (``"ASC"`` or ``"DESC"``). filters: Optional filters (:class:`EvaluationV2FilterArgs` or dict). query: Optional query string to narrow results. @@ -234,10 +326,11 @@ def examples( if self._client is None: raise RuntimeError("EvaluationV2 has no client.") payload: Dict[str, Any] = { - "match_type": match_type, "limit": limit, "offset": offset, } + if match_type is not None: + payload["match_type"] = match_type if sort_by is not None: payload["sort_by"] = sort_by if sort_order is not None: diff --git a/nucleus/evaluation_v2_exclusions.py b/nucleus/evaluation_v2_exclusions.py new file mode 100644 index 00000000..c529f2e1 --- /dev/null +++ b/nucleus/evaluation_v2_exclusions.py @@ -0,0 +1,112 @@ +"""Exclusion rules for Evaluation V2 creation. + +Mirrors ``packages/shared/src/nucleus/evaluationV2Exclusions.ts`` on the backend. +These rules drop items/annotations from an evaluation before metrics are computed. 
+ +The per-rule shape is validated server-side at create time +(``parseEvaluationV2ExclusionRulesWithDiagnostics``), which reports exactly which +rules were rejected and why — so these classes only need to serialize correctly. + +Pass instances (or equivalent plain dicts) to +:meth:`nucleus.NucleusClient.create_evaluation_v2` via ``exclusion_rules``:: + + client.create_evaluation_v2( + model_run_id, + exclusion_rules=[ + BoxAreaExclusionRule(scope="annotation", target="groundTruth", min=1024), + LabelExclusionRule(scope="item", target="prediction", labels=["ignore"]), + MetadataExclusionRule(key="is_dark", op="EQ", value=True), + ], + ) +""" + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Union + +# String literals are sent as values (not keys), so the server's request-body +# camelcaser preserves them verbatim — emit them exactly as the backend expects. +ExclusionScope = str # "item" | "annotation" +ExclusionTarget = str # "groundTruth" | "prediction" +MetadataOp = str # "EQ" | "IN" | "GT" | "LT" + + +@dataclass +class MetadataExclusionRule: + """Exclude whole items whose item-metadata ``key`` matches ``value`` under ``op``. + + ``scope`` is always ``"item"`` for metadata rules. + """ + + key: str + op: MetadataOp + value: Any + scope: ExclusionScope = "item" + + def to_api_dict(self) -> Dict[str, Any]: + return { + "type": "metadata", + "scope": self.scope, + "key": self.key, + "op": self.op, + "value": self.value, + } + + +@dataclass +class LabelExclusionRule: + """Exclude annotations/predictions (or whole items) carrying any of ``labels``. + + Parameters: + scope: ``"item"`` (drop the whole item if any annotation matches) or + ``"annotation"`` (drop only matching annotations). + target: ``"groundTruth"`` or ``"prediction"`` — which side to filter. + labels: Labels to exclude. 
+ """ + + scope: ExclusionScope + target: ExclusionTarget + labels: List[str] = field(default_factory=list) + + def to_api_dict(self) -> Dict[str, Any]: + return { + "type": "labels", + "scope": self.scope, + "target": self.target, + "labels": list(self.labels), + } + + +@dataclass +class BoxAreaExclusionRule: + """Exclude boxes whose pixel area falls outside ``[min, max]`` (at least one bound required). + + Parameters: + scope: ``"item"`` or ``"annotation"``. + target: ``"groundTruth"`` or ``"prediction"``. + min: Minimum pixel area (inclusive lower bound), or ``None``. + max: Maximum pixel area (inclusive upper bound), or ``None``. + """ + + scope: ExclusionScope + target: ExclusionTarget + min: Optional[float] = None + max: Optional[float] = None + + def to_api_dict(self) -> Dict[str, Any]: + out: Dict[str, Any] = { + "type": "boxArea", + "scope": self.scope, + "target": self.target, + } + if self.min is not None: + out["min"] = self.min + if self.max is not None: + out["max"] = self.max + return out + + +EvaluationV2ExclusionRule = Union[ + MetadataExclusionRule, + LabelExclusionRule, + BoxAreaExclusionRule, +] diff --git a/nucleus/evaluation_v2_preset.py b/nucleus/evaluation_v2_preset.py new file mode 100644 index 00000000..36bf3630 --- /dev/null +++ b/nucleus/evaluation_v2_preset.py @@ -0,0 +1,111 @@ +"""Evaluation V2 presets — saved, reusable evaluation configurations. + +A preset bundles a ``name`` with ``allowed_label_matches`` and ``exclusion_rules`` +so the same configuration can be applied across many evaluations. Presets are +private to the creating user. + +Mirrors the ``/v1/nucleus/evaluationV2Presets`` REST endpoints on the backend. 
+Create and manage presets via :class:`~nucleus.NucleusClient`:: + + preset = client.create_evaluation_v2_preset( + "vehicles", + allowed_label_matches=[AllowedLabelMatch("car", "vehicle")], + exclusion_rules=[LabelExclusionRule(scope="item", target="prediction", labels=["ignore"])], + ) + client.create_evaluation_v2(model_run_id, preset=preset) +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from nucleus.evaluation_v2 import ( + AllowedLabelMatch, + _parse_json_field, + parse_allowed_label_matches, +) + +if TYPE_CHECKING: + from nucleus import NucleusClient + + +# Sentinel distinguishing "argument omitted" from an explicit ``None`` (which, +# for ``exclusion_rules`` on update, means "clear the rules"). +class _Unset: + def __repr__(self) -> str: # pragma: no cover - cosmetic + return "" + + +_UNSET = _Unset() + + +@dataclass +class EvaluationV2Preset: + """A saved Evaluation V2 configuration owned by the current user.""" + + id: str + name: str + allowed_label_matches: Optional[List[AllowedLabelMatch]] = None + exclusion_rules: Optional[List[Dict[str, Any]]] = None + created_by_user_id: Optional[str] = None + created_at: Optional[str] = None + updated_at: Optional[str] = None + deleted_at: Optional[str] = None + _client: Optional["NucleusClient"] = field(repr=False, default=None) + + @classmethod + def from_json( + cls, + payload: Dict[str, Any], + client: Optional["NucleusClient"] = None, + ) -> "EvaluationV2Preset": + return cls( + id=str(payload["id"]), + name=str(payload["name"]), + allowed_label_matches=parse_allowed_label_matches( + payload.get("allowed_label_matches") + ), + exclusion_rules=_parse_json_field(payload.get("exclusion_rules")), + created_by_user_id=payload.get("created_by_user_id"), + created_at=payload.get("created_at"), + updated_at=payload.get("updated_at"), + deleted_at=payload.get("deleted_at"), + _client=client, + ) + + def update( + 
self, + *, + name: Any = _UNSET, + allowed_label_matches: Any = _UNSET, + exclusion_rules: Any = _UNSET, + ) -> "EvaluationV2Preset": + """Update this preset in place. + + Only the arguments you pass are changed. Passing + ``exclusion_rules=None`` clears the rules; omitting it leaves them + unchanged. + + Returns: + self, with updated fields. + """ + if self._client is None: + raise RuntimeError( + "EvaluationV2Preset has no client; fetch it via " + "NucleusClient.list_evaluation_v2_presets." + ) + updated = self._client.update_evaluation_v2_preset( + self.id, + name=name, + allowed_label_matches=allowed_label_matches, + exclusion_rules=exclusion_rules, + ) + self.__dict__.update(updated.__dict__) + return self + + def delete(self) -> None: + """Delete this preset.""" + if self._client is None: + raise RuntimeError("EvaluationV2Preset has no client.") + self._client.delete_evaluation_v2_preset(self.id) diff --git a/pyproject.toml b/pyproject.toml index 794811da..57004bfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running [tool.poetry] name = "scale-nucleus" -version = "0.18.8" +version = "0.18.9" description = "The official Python client library for Nucleus, the Data Platform for AI" license = "MIT" authors = ["Scale AI Nucleus Team "] diff --git a/tests/test_evaluation_v2.py b/tests/test_evaluation_v2.py index 829e34a2..01241d58 100644 --- a/tests/test_evaluation_v2.py +++ b/tests/test_evaluation_v2.py @@ -5,7 +5,14 @@ import pytest import requests -from nucleus import AllowedLabelMatch, EvaluationV2, NucleusClient +from nucleus import ( + AllowedLabelMatch, + BoxAreaExclusionRule, + EvaluationV2, + LabelExclusionRule, + MetadataExclusionRule, + NucleusClient, +) from nucleus.data_transfer_object.evaluation_v2 import ( EvaluationV2Charts, EvaluationV2FilterArgs, @@ -15,6 +22,28 @@ ) +def _charts_response(): + return { + "mapSummary": {"mapAt50": 0.1, "mapAt75": 0.2, 
"mapAt5095": 0.15}, + "perClassAp": [], + "confusionMatrix": [], + "scoreHistogram": [], + "computedIouRanges": [], + "totalCounts": {"tp": 0, "fp": 0, "fn": 0, "predsWithConfidence": 0}, + "apBySize": {"small": None, "medium": None, "large": None}, + "prCurve": [], + "tideAttribution": { + "truePositive": 0, + "localization": 0, + "classification": 0, + "both": 0, + "duplicate": 0, + "background": 0, + "missed": 0, + }, + } + + def test_evaluation_v2_filter_args_to_api_filters(): filters = EvaluationV2FilterArgs( confidence_range=RangeNum(min=0.1, max=0.9), @@ -134,27 +163,122 @@ def test_create_evaluation_v2_then_get(): client.connection.get.assert_called_once_with("evaluationsV2/evalv2_new") -def test_charts_get_query_string(): - client = MagicMock(spec=NucleusClient) - client.get.return_value = { - "mapSummary": {"mapAt50": 0.1, "mapAt75": 0.2, "mapAt5095": 0.15}, - "perClassAp": [], - "confusionMatrix": [], - "scoreHistogram": [], - "computedIouRanges": [], - "totalCounts": {"tp": 0, "fp": 0, "fn": 0, "predsWithConfidence": 0}, - "apBySize": {"small": None, "medium": None, "large": None}, - "prCurve": [], - "tideAttribution": { - "truePositive": 0, - "localization": 0, - "classification": 0, - "both": 0, - "duplicate": 0, - "background": 0, - "missed": 0, +def test_create_evaluation_v2_with_slice_and_exclusion_rules(): + client = NucleusClient(api_key="test") + client.connection.make_request = MagicMock( + return_value={"evaluation_id": "evalv2_new", "status": "pending"} + ) + client.connection.get = MagicMock( + return_value={ + "id": "evalv2_new", + "model_run_id": "run_1", + "dataset_id": "ds_1", + "status": "pending", + } + ) + client.create_evaluation_v2( + "run_1", + slice_id="slc_x", + exclusion_rules=[ + BoxAreaExclusionRule( + scope="annotation", target="groundTruth", min=1024 + ), + LabelExclusionRule( + scope="item", target="prediction", labels=["ignore"] + ), + MetadataExclusionRule(key="is_dark", op="EQ", value=True), + { + "type": "labels", + 
"scope": "item", + "target": "groundTruth", + "labels": ["x"], + }, + ], + ) + payload = client.connection.make_request.call_args[0][0] + assert payload["sliceId"] == "slc_x" + assert payload["exclusionRules"] == [ + { + "type": "boxArea", + "scope": "annotation", + "target": "groundTruth", + "min": 1024, + }, + { + "type": "labels", + "scope": "item", + "target": "prediction", + "labels": ["ignore"], + }, + { + "type": "metadata", + "scope": "item", + "key": "is_dark", + "op": "EQ", + "value": True, }, + { + "type": "labels", + "scope": "item", + "target": "groundTruth", + "labels": ["x"], + }, + ] + + +def test_evaluation_v2_filter_args_gt_area_and_slices(): + filters = EvaluationV2FilterArgs( + gt_area_range=RangeNum(min=1024, max=9216), + slice_ids=["slc_a"], + ) + assert filters.to_api_filters() == { + "gtAreaRange": {"min": 1024.0, "max": 9216.0}, + "sliceIds": ["slc_a"], } + + +def test_evaluation_v2_from_json_slice_and_exclusions(): + # exclusion_rules as a JSON string (raw jsonb), exclusion_stats as a dict. 
+ ev = EvaluationV2.from_json( + { + "id": "evalv2_1", + "model_run_id": "run_1", + "dataset_id": "ds_1", + "status": "succeeded", + "slice_id": "slc_x", + "exclusion_rules": '[{"type":"labels","scope":"item","target":"prediction","labels":["ignore"]}]', + "exclusion_stats": {"totals": {"itemsDropped": 3}}, + } + ) + assert ev.slice_id == "slc_x" + assert ev.exclusion_rules == [ + { + "type": "labels", + "scope": "item", + "target": "prediction", + "labels": ["ignore"], + } + ] + assert ev.exclusion_stats == {"totals": {"itemsDropped": 3}} + + +def test_evaluation_v2_from_json_exclusions_absent(): + ev = EvaluationV2.from_json( + { + "id": "evalv2_1", + "model_run_id": "run_1", + "dataset_id": "ds_1", + "status": "succeeded", + } + ) + assert ev.slice_id is None + assert ev.exclusion_rules is None + assert ev.exclusion_stats is None + + +def test_charts_post_body(): + client = MagicMock(spec=NucleusClient) + client.post.return_value = _charts_response() ev = EvaluationV2( id="evalv2_1", model_run_id="run_1", @@ -164,9 +288,35 @@ def test_charts_get_query_string(): ) charts = ev.charts(iou_threshold=0.5) assert isinstance(charts, EvaluationV2Charts) - call_route = client.get.call_args[0][0] - assert "evaluationsV2/evalv2_1/charts" in call_route - assert "iouThreshold=0.5" in call_route + client.post.assert_called_once() + payload, route = client.post.call_args[0] + assert route == "evaluationsV2/evalv2_1/charts" + assert payload == {"iouThreshold": 0.5} + + +def test_charts_with_filter_args(): + client = MagicMock(spec=NucleusClient) + client.post.return_value = _charts_response() + ev = EvaluationV2( + id="evalv2_1", + model_run_id="run_1", + dataset_id="ds_1", + status="succeeded", + _client=client, + ) + filters = EvaluationV2FilterArgs( + gt_area_range=RangeNum(min=1024), + slice_ids=["slc_a", "slc_b"], + ) + ev.charts(iou_threshold=0.75, filters=filters, query="dog") + payload, route = client.post.call_args[0] + assert route == "evaluationsV2/evalv2_1/charts" + 
assert payload["iouThreshold"] == 0.75 + assert payload["query"] == "dog" + assert payload["filters"] == { + "gtAreaRange": {"min": 1024.0}, + "sliceIds": ["slc_a", "slc_b"], + } def test_examples_post_body(): diff --git a/tests/test_evaluation_v2_presets.py b/tests/test_evaluation_v2_presets.py new file mode 100644 index 00000000..f6acce30 --- /dev/null +++ b/tests/test_evaluation_v2_presets.py @@ -0,0 +1,327 @@ +"""Unit tests for Evaluation V2 presets, batch create, cancel/retry, and +label-schema discovery (no live API).""" + +from unittest.mock import MagicMock + +import requests + +from nucleus import ( + AllowedLabelMatch, + EvaluationV2, + EvaluationV2Preset, + LabelExclusionRule, + NucleusClient, +) +from nucleus.dataset import Dataset + + +# --------------------------------------------------------------------------- # +# Preset CRUD +# --------------------------------------------------------------------------- # +def test_list_evaluation_v2_presets(): + client = NucleusClient(api_key="test") + client.connection.get = MagicMock( + return_value=[ + { + "id": "prev_1", + "name": "vehicles", + "allowed_label_matches": [ + {"groundTruthLabel": "car", "modelPredictionLabel": "vehicle"} + ], + "exclusion_rules": None, + "created_by_user_id": "u_1", + } + ] + ) + presets = client.list_evaluation_v2_presets() + client.connection.get.assert_called_once_with("evaluationV2Presets") + assert len(presets) == 1 + assert presets[0].id == "prev_1" + assert presets[0].name == "vehicles" + assert presets[0].allowed_label_matches[0] == AllowedLabelMatch( + ground_truth_label="car", model_prediction_label="vehicle" + ) + + +def test_create_evaluation_v2_preset_payload(): + client = NucleusClient(api_key="test") + client.connection.post = MagicMock( + return_value={ + "id": "prev_1", + "name": "vehicles", + "allowed_label_matches": [], + "exclusion_rules": None, + } + ) + preset = client.create_evaluation_v2_preset( + "vehicles", + 
allowed_label_matches=[AllowedLabelMatch("car", "vehicle")], + exclusion_rules=[ + LabelExclusionRule( + scope="item", target="prediction", labels=["ignore"] + ) + ], + ) + payload, route = client.connection.post.call_args[0] + assert route == "evaluationV2Presets" + assert payload["name"] == "vehicles" + assert payload["allowedLabelMatches"] == [ + {"ground_truth_label": "car", "model_prediction_label": "vehicle"} + ] + assert payload["exclusionRules"] == [ + { + "type": "labels", + "scope": "item", + "target": "prediction", + "labels": ["ignore"], + } + ] + assert preset.id == "prev_1" + + +def test_update_evaluation_v2_preset_name_only_omits_other_fields(): + client = NucleusClient(api_key="test") + client.connection.patch = MagicMock( + return_value={"id": "prev_1", "name": "renamed"} + ) + client.update_evaluation_v2_preset("prev_1", name="renamed") + payload, route = client.connection.patch.call_args[0] + assert route == "evaluationV2Presets/prev_1" + # Only the provided field is sent; matches/rules untouched. + assert payload == {"name": "renamed"} + + +def test_update_evaluation_v2_preset_clear_rules_sends_null(): + client = NucleusClient(api_key="test") + client.connection.patch = MagicMock( + return_value={"id": "prev_1", "name": "p"} + ) + client.update_evaluation_v2_preset("prev_1", exclusion_rules=None) + payload = client.connection.patch.call_args[0][0] + # Explicit None clears the rules (distinct from "leave unchanged"). + assert payload == {"exclusionRules": None} + + +def test_delete_evaluation_v2_preset(): + client = NucleusClient(api_key="test") + client.connection.make_request = MagicMock(return_value=MagicMock()) + client.delete_evaluation_v2_preset("prev_1") + # NucleusClient.make_request forwards args positionally to the connection: + # (payload, route, requests_command, return_raw_response). 
+ args = client.connection.make_request.call_args[0] + assert args[1] == "evaluationV2Presets/prev_1" + assert args[2] is requests.delete + + +def test_preset_instance_update_and_delete_delegate_to_client(): + client = MagicMock(spec=NucleusClient) + preset = EvaluationV2Preset(id="prev_1", name="p", _client=client) + client.update_evaluation_v2_preset.return_value = EvaluationV2Preset( + id="prev_1", name="renamed", _client=client + ) + preset.update(name="renamed") + assert preset.name == "renamed" + preset.delete() + client.delete_evaluation_v2_preset.assert_called_once_with("prev_1") + + +# --------------------------------------------------------------------------- # +# Apply preset + only_items_with_predictions on create +# --------------------------------------------------------------------------- # +def _stub_create(client): + client.connection.make_request = MagicMock( + return_value={"evaluation_id": "evalv2_new"} + ) + client.connection.get = MagicMock( + return_value={ + "id": "evalv2_new", + "model_run_id": "run_1", + "dataset_id": "ds_1", + "status": "pending", + } + ) + + +def test_create_evaluation_v2_with_preset_seeds_config(): + client = NucleusClient(api_key="test") + _stub_create(client) + preset = EvaluationV2Preset( + id="prev_1", + name="p", + allowed_label_matches=[AllowedLabelMatch("car", "vehicle")], + exclusion_rules=[ + { + "type": "labels", + "scope": "item", + "target": "groundTruth", + "labels": ["x"], + } + ], + ) + client.create_evaluation_v2("run_1", preset=preset) + payload = client.connection.make_request.call_args[0][0] + assert payload["allowed_label_matches"] == [ + {"ground_truth_label": "car", "model_prediction_label": "vehicle"} + ] + assert payload["exclusionRules"] == [ + { + "type": "labels", + "scope": "item", + "target": "groundTruth", + "labels": ["x"], + } + ] + + +def test_create_evaluation_v2_explicit_args_override_preset(): + client = NucleusClient(api_key="test") + _stub_create(client) + preset = 
EvaluationV2Preset( + id="prev_1", + name="p", + allowed_label_matches=[AllowedLabelMatch("car", "vehicle")], + ) + client.create_evaluation_v2( + "run_1", + preset=preset, + allowed_label_matches=[AllowedLabelMatch("dog", "animal")], + ) + payload = client.connection.make_request.call_args[0][0] + assert payload["allowed_label_matches"] == [ + {"ground_truth_label": "dog", "model_prediction_label": "animal"} + ] + + +def test_create_evaluation_v2_only_items_with_predictions(): + client = NucleusClient(api_key="test") + _stub_create(client) + client.create_evaluation_v2("run_1", only_items_with_predictions=True) + payload = client.connection.make_request.call_args[0][0] + assert payload["onlyItemsWithPredictions"] is True + + +# --------------------------------------------------------------------------- # +# Batch create +# --------------------------------------------------------------------------- # +def test_create_evaluations_v2_batch_cross_product_and_error_capture(): + client = NucleusClient(api_key="test") + seen = [] + + def fake_create(run, **kwargs): + seen.append((run, kwargs.get("slice_id"), kwargs.get("name"))) + if run == "run_bad": + raise RuntimeError("boom") + ev = MagicMock(spec=EvaluationV2) + ev.id = f"eval_{run}_{kwargs.get('slice_id')}" + return ev + + client.create_evaluation_v2 = fake_create + results = client.create_evaluations_v2_batch( + ["run_ok", "run_bad"], + slice_ids=["slc_1", None], + name_prefix="nightly", + ) + + # 2 runs x 2 targets = 4 jobs, returned in input order. + assert len(results) == 4 + assert results[0].model_run_id == "run_ok" + assert results[0].slice_id == "slc_1" + assert results[0].name == "nightly — run_ok — slc_1" + assert results[0].succeeded + assert results[1].name == "nightly — run_ok" # whole-dataset job + # Failures are captured per-job, not raised. 
+ assert results[2].model_run_id == "run_bad" + assert not results[2].succeeded + assert results[2].error == "boom" + + +def test_create_evaluations_v2_batch_defaults_to_whole_dataset(): + client = NucleusClient(api_key="test") + client.create_evaluation_v2 = MagicMock( + return_value=MagicMock(spec=EvaluationV2) + ) + results = client.create_evaluations_v2_batch(["run_1", "run_2"]) + assert len(results) == 2 + # No slice_ids -> one whole-dataset job per run. + for call in client.create_evaluation_v2.call_args_list: + assert call.kwargs["slice_id"] is None + + +# --------------------------------------------------------------------------- # +# Cancel / retry +# --------------------------------------------------------------------------- # +def _eval(client, status="computing"): + return EvaluationV2( + id="evalv2_1", + model_run_id="run_1", + dataset_id="ds_1", + status=status, + _client=client, + ) + + +def test_evaluation_cancel_posts_and_refreshes(): + client = MagicMock(spec=NucleusClient) + client.get.return_value = { + "id": "evalv2_1", + "model_run_id": "run_1", + "dataset_id": "ds_1", + "status": "cancelled", + } + ev = _eval(client) + ev.cancel() + args, kwargs = client.make_request.call_args + assert args[1] == "evaluationsV2/evalv2_1/cancel" + assert kwargs["requests_command"] is requests.post + assert ev.status == "cancelled" + + +def test_evaluation_retry_resolves_new_evaluation(): + client = MagicMock(spec=NucleusClient) + client.post.return_value = {"evaluation_id": "evalv2_retry"} + client.get_evaluation_v2.return_value = EvaluationV2( + id="evalv2_retry", + model_run_id="run_1", + dataset_id="ds_1", + status="pending", + _client=client, + ) + ev = _eval(client, status="failed") + new_eval = ev.retry() + _, route = client.post.call_args[0] + assert route == "evaluationsV2/evalv2_1/retry" + assert new_eval.id == "evalv2_retry" + client.get_evaluation_v2.assert_called_once_with("evalv2_retry") + + +# 
--------------------------------------------------------------------------- # +# Examples optional match_type +# --------------------------------------------------------------------------- # +def test_examples_match_type_optional(): + client = MagicMock(spec=NucleusClient) + client.post.return_value = {"rows": [], "total": 0} + ev = _eval(client, status="succeeded") + + ev.examples() + payload = client.post.call_args[0][0] + assert "match_type" not in payload + + ev.examples(match_type="FP") + payload2 = client.post.call_args[0][0] + assert payload2["match_type"] == "FP" + + +# --------------------------------------------------------------------------- # +# Label schema discovery +# --------------------------------------------------------------------------- # +def test_dataset_evaluation_label_schema(): + client = NucleusClient(api_key="test") + client.connection.make_request = MagicMock( + return_value={"gt_labels": ["car"], "prediction_labels": ["vehicle"]} + ) + dataset = Dataset("ds_1", client) + out = dataset.evaluation_label_schema() + assert out == {"gt_labels": ["car"], "prediction_labels": ["vehicle"]} + args = client.connection.make_request.call_args[0] + assert args[1] == "dataset/ds_1/labelSchema" + assert args[2] is requests.get From 3a29f12aef3dd629223ede9db06842f1af9927b3 Mon Sep 17 00:00:00 2001 From: Luke Schaefer Date: Thu, 25 Jun 2026 16:45:09 -0500 Subject: [PATCH 2/5] remove verbose comment --- nucleus/__init__.py | 2 +- nucleus/dataset.py | 2 +- nucleus/evaluation_v2_exclusions.py | 1 - nucleus/evaluation_v2_preset.py | 1 - 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/nucleus/__init__.py b/nucleus/__init__.py index a8591c77..8e098786 100644 --- a/nucleus/__init__.py +++ b/nucleus/__init__.py @@ -1007,7 +1007,7 @@ def create_evaluations_v2_batch( ) -> List[BatchEvaluationResult]: """Create many evaluations at once, sharing one configuration. 
- Mirrors the batch-create flow in the UI: one evaluation is created for + One evaluation is created for every ``(model_run_id, slice_id)`` pair (the cross-product of ``model_run_ids`` and ``slice_ids``), all sharing the same matches, exclusion rules, and options. Jobs run concurrently and failures are diff --git a/nucleus/dataset.py b/nucleus/dataset.py index 6f705c81..f7077798 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -233,7 +233,7 @@ def evaluation_label_schema(self) -> Dict[str, List[str]]: Useful for building :meth:`NucleusClient.create_evaluation_v2` ``allowed_label_matches`` and label exclusion rules without guessing - label names. Mirrors the label lists shown in the Create Evaluation UI. + label names. Returns: A dict with ``"gt_labels"`` (ground-truth annotation labels) and diff --git a/nucleus/evaluation_v2_exclusions.py b/nucleus/evaluation_v2_exclusions.py index c529f2e1..6669256e 100644 --- a/nucleus/evaluation_v2_exclusions.py +++ b/nucleus/evaluation_v2_exclusions.py @@ -1,6 +1,5 @@ """Exclusion rules for Evaluation V2 creation. -Mirrors ``packages/shared/src/nucleus/evaluationV2Exclusions.ts`` on the backend. These rules drop items/annotations from an evaluation before metrics are computed. The per-rule shape is validated server-side at create time diff --git a/nucleus/evaluation_v2_preset.py b/nucleus/evaluation_v2_preset.py index 36bf3630..09173ab8 100644 --- a/nucleus/evaluation_v2_preset.py +++ b/nucleus/evaluation_v2_preset.py @@ -4,7 +4,6 @@ so the same configuration can be applied across many evaluations. Presets are private to the creating user. -Mirrors the ``/v1/nucleus/evaluationV2Presets`` REST endpoints on the backend. 
Create and manage presets via :class:`~nucleus.NucleusClient`:: preset = client.create_evaluation_v2_preset( From 81b13811a0b7e770facc2eed508a8583f350cd4f Mon Sep 17 00:00:00 2001 From: Luke Schaefer Date: Thu, 25 Jun 2026 16:55:06 -0500 Subject: [PATCH 3/5] greptile --- nucleus/evaluation_v2_preset.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nucleus/evaluation_v2_preset.py b/nucleus/evaluation_v2_preset.py index 09173ab8..87258831 100644 --- a/nucleus/evaluation_v2_preset.py +++ b/nucleus/evaluation_v2_preset.py @@ -64,8 +64,13 @@ def from_json( name=str(payload["name"]), allowed_label_matches=parse_allowed_label_matches( payload.get("allowed_label_matches") + or payload.get("allowedLabelMatches") + ), + exclusion_rules=_parse_json_field( + payload.get("exclusion_rules") + if payload.get("exclusion_rules") is not None + else payload.get("exclusionRules") ), - exclusion_rules=_parse_json_field(payload.get("exclusion_rules")), created_by_user_id=payload.get("created_by_user_id"), created_at=payload.get("created_at"), updated_at=payload.get("updated_at"), From 3b73b6c69ab7b2dc8ea626aedf977d0363337a63 Mon Sep 17 00:00:00 2001 From: Luke Schaefer Date: Thu, 25 Jun 2026 17:36:38 -0500 Subject: [PATCH 4/5] remove api doc add --- docs/index.rst | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 88ec8c3d..cb310bab 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,36 +12,6 @@ Scale Nucleus helps you: Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios. -.. _evaluations-v2: - -Evaluations V2 --------------- - -Evaluation V2 measures how well a **model run** matches ground-truth annotations. 
-Create a run with :meth:`NucleusClient.create_evaluation_v2`, wait with -:meth:`nucleus.evaluation_v2.EvaluationV2.wait_for_completion`, then read summary metrics with -:meth:`nucleus.evaluation_v2.EvaluationV2.charts` or individual matches with -:meth:`nucleus.evaluation_v2.EvaluationV2.examples`. - -.. code-block:: python - - import nucleus - - client = nucleus.NucleusClient(api_key="YOUR_API_KEY") - evaluation = client.create_evaluation_v2( - model_run_id="run_xxx", - name="my-eval", - allowed_label_matches=[ - nucleus.AllowedLabelMatch( - ground_truth_label="car", - model_prediction_label="vehicle", - ), - ], - ) - evaluation.wait_for_completion() - charts = evaluation.charts(iou_threshold=0.5) - fps = evaluation.examples(match_type="FP", limit=20) - .. _installation: Installation @@ -56,8 +26,8 @@ To use Nucleus, first install it using `pip`: .. _api: -Sections --------- +API Reference +------------- .. toctree:: :maxdepth: 4 From 7cc33e79732864f64f0eaad691d893ac5d5a460d Mon Sep 17 00:00:00 2001 From: Luke Schaefer Date: Thu, 25 Jun 2026 17:41:34 -0500 Subject: [PATCH 5/5] greptile --- nucleus/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nucleus/__init__.py b/nucleus/__init__.py index 8e098786..efbc2261 100644 --- a/nucleus/__init__.py +++ b/nucleus/__init__.py @@ -958,7 +958,7 @@ def create_evaluation_v2( :class:`EvaluationV2`: The created evaluation. """ if preset is not None: - if allowed_label_matches is None: + if allowed_label_matches is None and allowed_label_matches_id is None: allowed_label_matches = preset.allowed_label_matches if exclusion_rules is None and preset.exclusion_rules is not None: exclusion_rules = list(preset.exclusion_rules)