From 116dbaf29760886e7f1e3c23f9a8f397c2686256 Mon Sep 17 00:00:00 2001 From: jiatolentino Date: Wed, 24 Jun 2026 12:53:01 +0800 Subject: [PATCH 1/4] fix: make DiscoveryMatch.label optional for non-sensitive/ignore matches A non-sensitive or ignore discovery match carries no sensitivity label, but DiscoveryMatch required `label: str`, so parsing schema-discovery results that contained such a match (e.g. a scope.non_sensitive column, hit via the generate_ruleset path) raised a pydantic ValidationError. The schema_discovery CSV path never parsed the model, so this was latent until MongoDB document discovery exercised it. --- datamasque/client/models/discovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 7bccefa..4c35ae0 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -252,7 +252,7 @@ class DiscoveryMatch(BaseModel): model_config = ConfigDict(extra="allow") - label: str + label: Optional[str] = None categories: list[str] flagged_by: str description: str From 7be03cd9982d7ae70b5c4c8436e95ff3b46289c3 Mon Sep 17 00:00:00 2001 From: Colin Haywood Date: Thu, 25 Jun 2026 09:56:36 +1200 Subject: [PATCH 2/4] feat: Add finished_with_warnings RG status Also fix comment re ignored matches - ignored matches don't get returned by the server at all --- datamasque/client/models/discovery.py | 4 ++-- datamasque/client/models/status.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/datamasque/client/models/discovery.py b/datamasque/client/models/discovery.py index 4c35ae0..23a5a88 100644 --- a/datamasque/client/models/discovery.py +++ b/datamasque/client/models/discovery.py @@ -343,8 +343,8 @@ class FileDiscoveryMatch(BaseModel): flagged_by: str description: str - label: Optional[str] = None # Omitted for non-sensitive and ignored matches. - categories: Optional[list[str]] = None # Omitted for ignored matches. + label: Optional[str] = None # Omitted for non-sensitive matches. + categories: Optional[list[str]] = None hit_ratio: Optional[int] = None # None for metadata matches, percentage 0-100 for IDD matches. diff --git a/datamasque/client/models/status.py b/datamasque/client/models/status.py index f470fd0..93871e8 100644 --- a/datamasque/client/models/status.py +++ b/datamasque/client/models/status.py @@ -60,6 +60,7 @@ class AsyncRulesetGenerationTaskStatus(enum.Enum): """List of statuses of async ruleset generation tasks.""" finished = "finished" + finished_with_warnings = "finished_with_warnings" failed = "failed" running = "running" queued = "queued" From ed6458e6aed1dc011a2c4c8f3f069e6e6377e501 Mon Sep 17 00:00:00 2001 From: jiatolentino Date: Tue, 16 Jun 2026 12:09:54 +0800 Subject: [PATCH 3/4] feat: return split db-discovery reports as zip bytes --- datamasque/client/discovery.py | 16 +++++++++++++-- datamasque/client/runs.py | 11 +++++++++-- tests/test_discovery.py | 36 ++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/datamasque/client/discovery.py b/datamasque/client/discovery.py index f6f3850..69862eb 100644 --- a/datamasque/client/discovery.py +++ b/datamasque/client/discovery.py @@ -98,6 +98,10 @@ def start_async_ruleset_generation_from_csv( - A text file handle (e.g. `open(path)`) - A binary file handle (e.g. `open(path, 'rb')`) + If the content is a zip (for example a split report from `get_db_discovery_result_report()`), + it is detected by its magic bytes and uploaded as a zip; + otherwise it is uploaded as CSV. + Generation runs asynchronously on the server. Poll `get_async_ruleset_generation_task_status` until it returns `AsyncRulesetGenerationTaskStatus.finished`, @@ -114,14 +118,22 @@ def start_async_ruleset_generation_from_csv( else: content = csv_content + is_zip = False + if content.seekable(): + is_zip = content.read(4) == b"PK\x03\x04" + content.seek(0) + filename = "ruleset.zip" if is_zip else "ruleset.csv" + content_type = "application/zip" if is_zip else "text/csv" + files = [ UploadFile( field_name="csv_or_zip_file", - filename="ruleset.csv", + filename=filename, content=content, - content_type="text/csv", + content_type=content_type, ), ] + self.make_request( method="POST", path=f"/api/async-generate-ruleset/{connection_id}/from-csv/", diff --git a/datamasque/client/runs.py b/datamasque/client/runs.py index 6b9a827..9f4c87c 100644 --- a/datamasque/client/runs.py +++ b/datamasque/client/runs.py @@ -1,5 +1,6 @@ import logging import re +from typing import Union from datamasque.client.base import BaseClient from datamasque.client.exceptions import ( @@ -43,9 +44,12 @@ def get_run_report(self, run_id: RunId) -> str: response = self.make_request("GET", f"api/runs/{run_id}/run-report/") return response.text - def get_db_discovery_result_report(self, run_id: RunId, include_selection_column: bool = True) -> str: + def get_db_discovery_result_report(self, run_id: RunId, include_selection_column: bool = True) -> Union[str, bytes]: """ - Returns the database-discovery result report for the specified run as CSV. + Returns the database-discovery result report for the specified run. + + Returns CSV text (`str`), + or a zip of numbered CSV parts as `bytes` when the server splits a large report. When `include_selection_column` is true (the default), the CSV includes a `selected` column suitable for feeding back into ruleset generation. @@ -54,6 +58,9 @@ def get_db_discovery_result_report(self, run_id: RunId, include_selection_column url = f"api/runs/{run_id}/db-discovery-results/report/" params = None if include_selection_column else {"include_selection_column": "false"} response = self.make_request("GET", url, params=params) + + if response.headers.get("Content-Type", "").startswith("application/zip"): + return response.content return response.text def get_unfinished_runs(self) -> dict[str, UnfinishedRun]: diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 42debfd..d954045 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -108,6 +108,17 @@ def test_get_db_discovery_result_report(client): assert result == "db discovery report without selection column" +def test_get_db_discovery_result_report_returns_zip_bytes_when_split(client): + run_id = RunId(1) + zip_bytes = b"PK\x03\x04 split report zip bytes" + with requests_mock.Mocker() as m: + url = f"http://test-server/api/runs/{run_id}/db-discovery-results/report/" + m.get(url, content=zip_bytes, headers={"Content-Type": "application/zip"}, status_code=200) + result = client.get_db_discovery_result_report(run_id) + assert result == zip_bytes + assert isinstance(result, bytes) + + def test_poll_async_ruleset_generation(client): connection_id = ConnectionId("1") with requests_mock.Mocker() as m: @@ -463,6 +474,31 @@ def test_start_async_ruleset_generation_from_csv_success(client, csv_content): assert form_data["csv_or_zip_file"]["content"] == b"schema,table,column,selected\npublic,users,email,true" +@pytest.mark.parametrize( + "zip_content", + [ + b"PK\x03\x04 zipped discovery report", + BytesIO(b"PK\x03\x04 zipped discovery report"), + ], + ids=["bytes", "BytesIO"], +) +def test_start_async_ruleset_generation_from_csv_uploads_zip_as_zip(client, zip_content): + """A split report is uploaded with a .zip filename and zip content-type, whether passed as bytes or a binary stream.""" + connection_id = ConnectionId("1") + + with requests_mock.Mocker() as m: + m.post( + f"http://test-server/api/async-generate-ruleset/{connection_id}/from-csv/", + status_code=201, + ) + client.start_async_ruleset_generation_from_csv(connection_id, zip_content) + + form_data = parse_multipart_form(m.last_request) + assert form_data["csv_or_zip_file"]["filename"] == "ruleset.zip" + assert form_data["csv_or_zip_file"]["content_type"] == "application/zip" + assert form_data["csv_or_zip_file"]["content"] == b"PK\x03\x04 zipped discovery report" + + def test_start_async_ruleset_generation_from_csv_with_target_size(client): """Test async ruleset generation from CSV with target_size_bytes parameter.""" connection_id = ConnectionId("1") From 1d65dca1556c5dfbf87ba64a2453db174d6cd1cd Mon Sep 17 00:00:00 2001 From: jiatolentino Date: Thu, 25 Jun 2026 06:01:25 +0800 Subject: [PATCH 4/4] chore: release 1.1.1 --- HISTORY.rst | 9 +++++++++ pyproject.toml | 2 +- setup.cfg | 2 +- uv.lock | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index dc72f9a..cbfc4d4 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,15 @@ History ======= +1.1.1 (2026-06-25) +------------------ + +* Made ``DiscoveryMatch.label`` optional (it is absent for non-sensitive/ignore matches). +* Added the ``finished_with_warnings`` status to ``AsyncRulesetGenerationTaskStatus``. +* ``get_db_discovery_result_report`` may now return ``bytes`` (a zip) + when the server splits a large DB-discovery report, + and ruleset generation from CSV now detects and forwards zip uploads. + 1.1.0 (2026-06-24) ------------------ diff --git a/pyproject.toml b/pyproject.toml index 1d226e4..eca07af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "datamasque-python" -version = "1.1.0" +version = "1.1.1" description = "Official Python client for the DataMasque data-masking API." authors = [ { name = "DataMasque Ltd" }, diff --git a/setup.cfg b/setup.cfg index 26b71a7..1022f00 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.1.0 +current_version = 1.1.1 commit = True tag = True diff --git a/uv.lock b/uv.lock index 31e21ef..777b461 100644 --- a/uv.lock +++ b/uv.lock @@ -428,7 +428,7 @@ toml = [ [[package]] name = "datamasque-python" -version = "1.1.0.dev0" +version = "1.1.1" source = { editable = "." } dependencies = [ { name = "pydantic" },