Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,11 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD
- `--compression`
- Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. The source compression is auto-detected from the file extension. Example: `--compression gz` converts all downloaded compressed files to gzip format.
- `--format`
- Enables on-the-fly RDF and tabular format conversion during download (Layer 2). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Example: `--format turtle` converts all downloaded RDF triple files to Turtle format.
- Enables on-the-fly RDF and tabular format conversion during download (Layer 2 and Layer 3). Supported formats: `ntriples` (`nt`), `turtle` (`ttl`), `rdf-xml` (`rdf`, `xml`), `nquads` (`nq`), `trig`, `trix`, `json-ld` (`jsonld`), `csv`, `tsv`. Short aliases shown in brackets. Only the converted output file is kept — the original is deleted after successful conversion. Within the same equivalence class (e.g. turtle to ntriples) conversion is lossless. Conversion across classes may require one of the flags below: `--graph-name` when converting RDF triples to a quad format (e.g. turtle to nquads), and `--base-uri` when converting CSV/TSV to RDF triples.
- `--graph-name`
- Required when converting RDF triples to a quad format (e.g. turtle to nquads). Assigns all triples to the specified named graph URI. Example: `--format nquads --graph-name https://example.org/mygraph`.
- `--base-uri`
- Required when converting CSV/TSV to RDF triples. Used as the base for constructing subject URIs from CSV row identifiers. Example: `--format ntriples --base-uri https://example.org/data/`.
- `--validate-checksum`
- Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted.

Expand Down Expand Up @@ -296,6 +300,24 @@ databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased
databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format ntriples --compression gz
```

**Download with Mapping Conversion (Layer 3)**: convert across format classes — between RDF triples, RDF quads, and tabular data.
```bash
# RDF Triples -> RDF Quads (requires --graph-name)
databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format nquads --graph-name https://example.org/mygraph

# RDF Quads -> RDF Triples (splits into one file per named graph, in a subdirectory)
databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.nq --format turtle

# RDF Triples -> CSV (produces a companion .meta.json preserving datatypes/language tags)
databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 --format csv

# CSV -> RDF Triples (requires --base-uri; lossless if companion .meta.json is present)
databusclient download https://databus.dbpedia.org/dbpedia/some-tabular-dataset/2022.12.01/data.csv --format ntriples --base-uri https://example.org/data/

# RDF Quads -> CSV (adds a 'graph' column)
databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.nq --format csv
```

<a id="cli-deploy"></a>
### Deploy

Expand Down
125 changes: 115 additions & 10 deletions databusclient/api/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@
get_databus_id_parts_from_file_url,
compute_sha256_and_length,
)
from databusclient.filehandling.format import convert_file, get_converted_filename
from databusclient.filehandling.format import (
convert_file,
get_converted_filename,
normalize_format,
get_format_class,
detect_format_from_filename,
FORMAT_TO_EXTENSION,
)

# Compression format mappings
COMPRESSION_EXTENSIONS = {
Expand Down Expand Up @@ -316,6 +323,8 @@ def _download_file(
client_id=None,
compression=None,
convert_format=None,
graph_name=None,
base_uri=None,
validate_checksum: bool = False,
expected_checksum: str | None = None,
) -> None:
Expand All @@ -331,6 +340,8 @@ def _download_file(
compression: Target compression format for on-the-fly conversion.
Source compression is auto-detected from the file extension.
convert_format: Target RDF/tabular format for on-the-fly conversion.
graph_name: Named graph URI for Triple -> Quad conversion (Layer 3).
base_uri: Base URI for CSV -> Triple conversion (Layer 3).
validate_checksum: Whether to validate checksums after downloading.
expected_checksum: The expected checksum of the file.
"""
Expand Down Expand Up @@ -555,10 +566,6 @@ def _download_file(
# already matches target format, skip decompression and conversion
# entirely — no work needed for the format part.
if needs_format_conversion and source_compression is not None:
from databusclient.filehandling.format import (
detect_format_from_filename,
normalize_format,
)
detected_input_format = detect_format_from_filename(file)
normalized_target = normalize_format(convert_format)
if detected_input_format == normalized_target:
Expand Down Expand Up @@ -599,10 +606,57 @@ def _download_file(

conversion_input_path = temp_decompressed_path

# Convert format on uncompressed input.
# Determine whether this is a Quad -> Triple (Layer 3) conversion.
# This direction produces multiple output files (one per named
# graph) written into a subdirectory, rather than a single file —
# so it is handled separately from the standard single-file path
# below (no recompression, no single-file delete-and-replace).
normalized_convert_format = normalize_format(convert_format)
target_class = get_format_class(normalized_convert_format)
source_format_for_mapping = detect_format_from_filename(conversion_input_path)
source_class_for_mapping = (
get_format_class(source_format_for_mapping)
if source_format_for_mapping else None
)
is_quad_to_triple = (
source_class_for_mapping == "quads" and target_class == "triples"
)

if is_quad_to_triple:
# Output directory name = original filename with compression and
# format extensions stripped (e.g. "data.nq.gz" -> "data").
output_stem = get_converted_filename(file, convert_format)
target_ext = FORMAT_TO_EXTENSION.get(normalized_convert_format, "")
if target_ext and output_stem.lower().endswith(target_ext):
output_stem = output_stem[: -len(target_ext)]
output_dir = os.path.join(localDir, output_stem)

convert_file(
conversion_input_path,
output_dir,
convert_format,
graph_name=graph_name,
base_uri=base_uri,
)

# Delete the original downloaded (possibly compressed) file —
# the split output directory replaces it.
if os.path.exists(filename):
os.remove(filename)
print(f"Removed original file: {os.path.basename(filename)}")
return

# Standard single-output-file path (Layer 2, and the remaining
# Layer 3 directions: Triple->Quad, Triple<->TSD, Quad->TSD).
# Quad->Triple never reaches this point; it returned above.
converted_basename = get_converted_filename(file, convert_format)
converted_uncompressed_path = os.path.join(localDir, converted_basename)
convert_file(conversion_input_path, converted_uncompressed_path, convert_format)
convert_file(
conversion_input_path,
converted_uncompressed_path,
convert_format,
graph_name=graph_name,
base_uri=base_uri,
)

# Delete the original downloaded file after successful format conversion,
# unless the converted output is the same file (same format, same path).
Expand All @@ -612,12 +666,18 @@ def _download_file(
print(f"Removed original file: {os.path.basename(filename)}")

# Recompress converted output when needed.
# Four cases:
# 1. Source was compressed + --compression given -> use target compression
# 2. Source was compressed, no --compression given -> recompress with original
# 3. Source was NOT compressed + --compression given -> compress the output
# 4. Source was NOT compressed, no --compression given -> no compression
if source_compression is not None:
if should_convert_compression and compression:
final_compression = compression
else:
final_compression = source_compression
elif should_convert_compression and compression:
elif compression:
# Source was uncompressed but user explicitly requested --compression
final_compression = compression
else:
final_compression = None
Expand Down Expand Up @@ -651,6 +711,8 @@ def _download_files(
client_id: str = None,
compression: str = None,
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
validate_checksum: bool = False,
checksums: dict | None = None,
) -> None:
Expand All @@ -665,6 +727,8 @@ def _download_files(
client_id: Client ID for token exchange.
compression: Target compression format for on-the-fly conversion.
convert_format: Target RDF/tabular format for on-the-fly conversion.
graph_name: Named graph URI for Triple -> Quad conversion (Layer 3).
base_uri: Base URI for CSV -> Triple conversion (Layer 3).
validate_checksum: Whether to validate checksums after downloading.
checksums: Dictionary mapping URLs to their expected checksums.
"""
Expand All @@ -681,11 +745,12 @@ def _download_files(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
expected_checksum=expected,
)


def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> str:
"""Get SPARQL query of collection members from databus collection URI.

Expand Down Expand Up @@ -829,6 +894,8 @@ def _download_collection(
client_id: str = None,
compression: str = None,
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
validate_checksum: bool = False,
) -> None:
"""Download all files in a databus collection.
Expand All @@ -843,6 +910,8 @@ def _download_collection(
client_id: Client ID for token exchange.
compression: Target compression format for on-the-fly conversion.
convert_format: Target RDF/tabular format for on-the-fly conversion.
graph_name: Named graph URI for Triple -> Quad conversion (Layer 3).
base_uri: Base URI for CSV -> Triple conversion (Layer 3).
validate_checksum: Whether to validate checksums after downloading.
"""
query = _get_sparql_query_of_collection(uri, databus_key=databus_key)
Expand All @@ -864,6 +933,8 @@ def _download_collection(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
checksums=checksums if checksums else None,
)
Expand All @@ -878,6 +949,8 @@ def _download_version(
client_id: str = None,
compression: str = None,
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
validate_checksum: bool = False,
) -> None:
"""Download all files in a databus artifact version.
Expand All @@ -891,6 +964,8 @@ def _download_version(
client_id: Client ID for token exchange.
compression: Target compression format for on-the-fly conversion.
convert_format: Target RDF/tabular format for on-the-fly conversion.
graph_name: Named graph URI for Triple -> Quad conversion (Layer 3).
base_uri: Base URI for CSV -> Triple conversion (Layer 3).
validate_checksum: Whether to validate checksums after downloading.
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
Expand All @@ -911,6 +986,8 @@ def _download_version(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
checksums=checksums,
)
Expand All @@ -926,6 +1003,8 @@ def _download_artifact(
client_id: str = None,
compression: str = None,
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
validate_checksum: bool = False,
) -> None:
"""Download files in a databus artifact.
Expand All @@ -940,6 +1019,8 @@ def _download_artifact(
client_id: Client ID for token exchange.
compression: Target compression format for on-the-fly conversion.
convert_format: Target RDF/tabular format for on-the-fly conversion.
graph_name: Named graph URI for Triple -> Quad conversion (Layer 3).
base_uri: Base URI for CSV -> Triple conversion (Layer 3).
validate_checksum: Whether to validate checksums after downloading.
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
Expand All @@ -966,6 +1047,8 @@ def _download_artifact(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
checksums=checksums,
)
Expand Down Expand Up @@ -1042,6 +1125,8 @@ def _download_group(
client_id: str = None,
compression: str = None,
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
validate_checksum: bool = False,
) -> None:
"""Download files in a databus group.
Expand All @@ -1056,6 +1141,8 @@ def _download_group(
client_id: Client ID for token exchange.
compression: Target compression format for on-the-fly conversion.
convert_format: Target RDF/tabular format for on-the-fly conversion.
graph_name: Named graph URI for Triple -> Quad conversion (Layer 3).
base_uri: Base URI for CSV -> Triple conversion (Layer 3).
validate_checksum: Whether to validate checksums after downloading.
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
Expand All @@ -1072,6 +1159,8 @@ def _download_group(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
)

Expand Down Expand Up @@ -1121,6 +1210,8 @@ def download(
client_id="vault-token-exchange",
compression=None,
convert_format=None,
graph_name=None,
base_uri=None,
validate_checksum: bool = False,
) -> None:
"""Download datasets from databus.
Expand All @@ -1136,8 +1227,10 @@ def download(
auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token".
client_id: Client ID for token exchange. Default is "vault-token-exchange".
compression: Target compression format for on-the-fly conversion (supported: bz2, gz, xz).
Source compression is auto-detected from the file extension.
Source compression is auto-detected from the file extension.
convert_format: Target RDF/tabular format for on-the-fly conversion.
graph_name: Named graph URI for Triple -> Quad conversion (Layer 3).
base_uri: Base URI for CSV -> Triple conversion (Layer 3).
validate_checksum: Whether to validate checksums after downloading.
"""
for databusURI in databusURIs:
Expand Down Expand Up @@ -1167,6 +1260,8 @@ def download(
client_id,
compression,
convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
)
elif file is not None:
Expand All @@ -1188,6 +1283,8 @@ def download(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
expected_checksum=expected,
)
Expand All @@ -1202,6 +1299,8 @@ def download(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
)
elif artifact is not None:
Expand All @@ -1218,6 +1317,8 @@ def download(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
)
elif group is not None and group != "collections":
Expand All @@ -1234,6 +1335,8 @@ def download(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
)
elif account is not None:
Expand Down Expand Up @@ -1272,6 +1375,8 @@ def download(
client_id=client_id,
compression=compression,
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
validate_checksum=validate_checksum,
checksums=checksums if checksums else None,
)
Loading
Loading