Skip to content

Commit 138a10c

Browse files
feat: Improve content variant handling in distribution parsing
1 parent b2c3f1c commit 138a10c

4 files changed

Lines changed: 228 additions & 13 deletions

File tree

databusclient/api/deploy.py

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,41 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str,
173173
return cvs, format_extension, compression, sha256sum, content_length
174174

175175

176+
def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
177+
"""
178+
Extract file info from a pre-parsed distribution dictionary.
179+
180+
Parameters
181+
----------
182+
dist_dict : dict
183+
A dictionary with keys: url, variants, formatExtension, compression
184+
(as returned by parse_distribution_str in cli.py)
185+
186+
Returns
187+
-------
188+
Tuple containing:
189+
- cvs: Dict of content variants
190+
- format_extension: File format extension
191+
- compression: Compression type
192+
- sha256sum: SHA-256 hash of file
193+
- content_length: File size in bytes
194+
"""
195+
url = dist_dict.get("url", "")
196+
cvs = dist_dict.get("variants", {})
197+
format_extension = dist_dict.get("formatExtension") or "file"
198+
compression = dist_dict.get("compression") or "none"
199+
200+
# Check if sha256sum and content_length are provided
201+
sha256sum = dist_dict.get("sha256sum")
202+
content_length = dist_dict.get("byteSize")
203+
204+
# If not provided, load from URL
205+
if sha256sum is None or content_length is None:
206+
sha256sum, content_length = _load_file_stats(url)
207+
208+
return cvs, format_extension, compression, sha256sum, content_length
209+
210+
176211
def create_distribution(
177212
url: str,
178213
cvs: Dict[str, str],
@@ -272,7 +307,7 @@ def create_dataset(
272307
abstract: str,
273308
description: str,
274309
license_url: str,
275-
distributions: List[str],
310+
distributions: Union[List[str], List[Dict]],
276311
attribution: str = None,
277312
derived_from: str = None,
278313
group_title: str = None,
@@ -296,8 +331,10 @@ def create_dataset(
296331
A long description of the dataset. Markdown syntax is supported
297332
license_url: str
298333
The license of the dataset as a URI.
299-
distributions: str
300-
Distribution information string as it is in the CLI. Can be created by running the create_distribution function
334+
distributions: Union[List[str], List[Dict]]
335+
Distribution information. Can be either:
336+
- List[str]: Legacy format with pipe-separated strings (created by create_distribution function)
337+
- List[Dict]: Pre-parsed dictionaries with keys: url, variants, formatExtension, compression
301338
attribution: str
302339
OPTIONAL! The attribution information for the Dataset
303340
derived_from: str
@@ -326,15 +363,28 @@ def create_dataset(
326363
artifact_id = _versionId.rsplit("/", 1)[0]
327364

328365
distribution_list = []
329-
for dst_string in distributions:
330-
__url = str(dst_string).split("|")[0]
331-
(
332-
cvs,
333-
formatExtension,
334-
compression,
335-
sha256sum,
336-
content_length,
337-
) = get_file_info(dst_string)
366+
for dst in distributions:
367+
# Check if distribution is a pre-parsed dict or a legacy string
368+
if isinstance(dst, dict):
369+
# New format: pre-parsed dictionary from parse_distribution_str()
370+
__url = dst.get("url", "")
371+
(
372+
cvs,
373+
formatExtension,
374+
compression,
375+
sha256sum,
376+
content_length,
377+
) = _get_file_info_from_dict(dst)
378+
else:
379+
# Legacy format: pipe-separated string
380+
__url = str(dst).split("|")[0]
381+
(
382+
cvs,
383+
formatExtension,
384+
compression,
385+
sha256sum,
386+
content_length,
387+
) = get_file_info(dst)
338388

339389
if not cvs and len(distributions) > 1:
340390
raise BadArgumentException(

databusclient/api/queries.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""
2+
SPARQL Queries for Databus Python Client
3+
4+
This module contains SPARQL queries used for interacting with the DBpedia Databus.
5+
"""
6+
7+
# Query to fetch ontologies with proper content variant aggregation.
# Uses GROUP_CONCAT to collapse multiple dataid:contentVariant values per
# distribution into one comma-separated ?contentVariants column (parse it
# back with parse_content_variants_string below).
# NOTE: the '#' lines inside the string are SPARQL comments, part of the query.
ONTOLOGIES_QUERY = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX databus: <https://databus.dbpedia.org/>
PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
PREFIX dataid-cv: <http://dataid.dbpedia.org/ns/cv#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
?group ?art ?version ?title ?publisher ?comment ?description
?license ?file ?extension ?type ?bytes ?shasum
(GROUP_CONCAT(DISTINCT ?variantStr; separator=", ") AS ?contentVariants)
WHERE {
?dataset dataid:account databus:ontologies .
?dataset dataid:group ?group .
?dataset dataid:artifact ?art.
?dataset dcat:distribution ?distribution .
?dataset dct:license ?license .
?dataset dct:publisher ?publisher .
?dataset rdfs:comment ?comment .
?dataset dct:description ?description .
?dataset dct:title ?title .
?distribution dcat:downloadURL ?file .
?distribution dataid:formatExtension ?extension .
?distribution dataid-cv:type ?type .
?distribution dcat:byteSize ?bytes .
?distribution dataid:sha256sum ?shasum .
?dataset dct:hasVersion ?version .

# Excludes dev versions
FILTER (!regex(?art, "--DEV"))

# OPTIONAL: Check for variants, but don't fail if none exist
OPTIONAL {
?distribution dataid:contentVariant ?cv .
BIND(STR(?cv) AS ?variantStr)
}

}
GROUP BY ?group ?art ?version ?title ?publisher ?comment ?description ?license ?file ?extension ?type ?bytes ?shasum
ORDER BY ?version
"""
52+
53+
54+
def parse_content_variants_string(variants_str: str) -> dict:
    """
    Parse a comma-separated content variants string from SPARQL GROUP_CONCAT result.

    Parameters
    ----------
    variants_str : str
        Comma-separated string of content variants, e.g., "lang=en, type=full, sorted=true"

    Returns
    -------
    dict
        Dictionary of key-value pairs, e.g., {"lang": "en", "type": "full", "sorted": "true"}.
        Standalone tokens without "=" are recorded as boolean True flags.
    """
    parsed = {}
    if variants_str and variants_str.strip():
        for token in (piece.strip() for piece in variants_str.split(",")):
            if "=" in token:
                name, _, raw_value = token.partition("=")
                parsed[name.strip()] = raw_value.strip()
            elif token:
                # Standalone value (no key=value format) becomes a boolean flag
                parsed[token] = True
    return parsed

databusclient/cli.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22
import json
33
import os
4+
import re
45
from typing import List
56

67
import click
@@ -11,6 +12,51 @@
1112
from databusclient.extensions import webdav
1213

1314

15+
def parse_distribution_str(dist_str: str):
    """
    Parses a distribution string with format:
        URL|key=value|...|.extension

    Modifiers after the URL are interpreted as:
      - ".ext"     : a compression suffix if it is a known compression
                     format, otherwise the file format extension
      - "key=value": a content variant
      - anything else is ignored with a warning

    Returns a dictionary suitable for the deploy API:
        {"url", "variants", "formatExtension", "compression"}
    """
    parts = dist_str.split('|')
    url = parts[0].strip()

    variants = {}
    format_ext = None
    compression = None

    # Suffixes that denote a compression layer rather than a data format.
    # (.bz2 and .xz added — previously misclassified as format extensions.)
    compression_exts = {'.gz', '.zip', '.br', '.tar', '.zst', '.bz2', '.xz'}

    # Iterate over the modifiers (everything after the URL)
    for part in parts[1:]:
        part = part.strip()

        # Case 1: Extension (starts with .)
        if part.startswith('.'):
            if part.lower() in compression_exts:
                compression = part.lstrip('.')  # remove leading dot for API compatibility if needed
            else:
                format_ext = part.lstrip('.')

        # Case 2: Content Variant (key=value)
        elif '=' in part:
            key, value = part.split('=', 1)
            variants[key.strip()] = value.strip()

        # Case 3: Standalone tag — not supported, warn and skip
        else:
            print(f"WARNING: Unrecognized modifier '{part}' in distribution. Expected '.ext' or 'key=val'.")

    return {
        "url": url,
        "variants": variants,
        "formatExtension": format_ext,
        "compression": compression
    }
58+
59+
1460
@click.group()
1561
def app():
1662
"""Databus Client CLI"""
@@ -81,9 +127,16 @@ def deploy(
81127
click.echo("[MODE] Classic deploy with distributions")
82128
click.echo(f"Deploying dataset version: {version_id}")
83129

130+
# --- CHANGE START ---
131+
# Parse the input strings into structured objects
132+
parsed_distributions = [parse_distribution_str(d) for d in distributions]
133+
134+
# Note: api_deploy.create_dataset now accepts this list of dicts
84135
dataid = api_deploy.create_dataset(
85-
version_id, title, abstract, description, license_url, distributions
136+
version_id, title, abstract, description, license_url, parsed_distributions
86137
)
138+
# --- CHANGE END ---
139+
87140
api_deploy.deploy(dataid=dataid, api_key=apikey)
88141
return
89142

tests/test_deploy.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,41 @@
1111
_get_content_variants,
1212
BadArgumentException,
1313
)
14+
from databusclient.cli import parse_distribution_str
1415

1516
EXAMPLE_URL = "https://raw.githubusercontent.com/dbpedia/databus/608482875276ef5df00f2360a2f81005e62b58bd/server/app/api/swagger.yml"
1617

1718

19+
def test_parse_distribution_str():
    """Test the new pipe-separated distribution string parser"""

    # Full form: several content variants plus format and compression
    parsed = parse_distribution_str("http://example.com/data|lang=en|type=full|sorted=true|.ttl|.gz")
    assert parsed["url"] == "http://example.com/data"
    assert parsed["variants"] == {"lang": "en", "type": "full", "sorted": "true"}
    assert parsed["formatExtension"] == "ttl"
    assert parsed["compression"] == "gz"

    # Single content variant with a format extension only
    parsed = parse_distribution_str("http://mysite.com/data.json|lang=fr|.json")
    assert parsed["url"] == "http://mysite.com/data.json"
    assert parsed["variants"] == {"lang": "fr"}
    assert parsed["formatExtension"] == "json"
    assert parsed["compression"] is None

    # Bare URL: everything else stays empty/None
    parsed = parse_distribution_str("http://example.com/file.csv")
    assert parsed["url"] == "http://example.com/file.csv"
    assert parsed["variants"] == {}
    assert parsed["formatExtension"] is None
    assert parsed["compression"] is None

    # Compression suffix without any format extension
    parsed = parse_distribution_str("http://example.com/data|.gz")
    assert parsed["url"] == "http://example.com/data"
    assert parsed["compression"] == "gz"
1849
def test_get_content_variants():
1950
# With content variants
2051
cvs = _get_content_variants(

0 commit comments

Comments
 (0)