Skip to content

Commit 138a10c

Browse files
feat: Improve content variant handling in distribution parsing
1 parent b2c3f1c commit 138a10c

4 files changed

Lines changed: 228 additions & 13 deletions

File tree

databusclient/api/deploy.py

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,41 @@ def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str,
173173
return cvs, format_extension, compression, sha256sum, content_length
174174

175175

176+
def _get_file_info_from_dict(dist_dict: Dict[str, any]) -> Tuple[Dict[str, str], str, str, str, int]:
177+
"""
178+
Extract file info from a pre-parsed distribution dictionary.
179+
180+
Parameters
181+
----------
182+
dist_dict : dict
183+
A dictionary with keys: url, variants, formatExtension, compression
184+
(as returned by parse_distribution_str in cli.py)
185+
186+
Returns
187+
-------
188+
Tuple containing:
189+
- cvs: Dict of content variants
190+
- format_extension: File format extension
191+
- compression: Compression type
192+
- sha256sum: SHA-256 hash of file
193+
- content_length: File size in bytes
194+
"""
195+
url = dist_dict.get("url", "")
196+
cvs = dist_dict.get("variants", {})
197+
format_extension = dist_dict.get("formatExtension") or "file"
198+
compression = dist_dict.get("compression") or "none"
199+
200+
# Check if sha256sum and content_length are provided
201+
sha256sum = dist_dict.get("sha256sum")
202+
content_length = dist_dict.get("byteSize")
203+
204+
# If not provided, load from URL
205+
if sha256sum is None or content_length is None:
206+
sha256sum, content_length = _load_file_stats(url)
207+
208+
return cvs, format_extension, compression, sha256sum, content_length
209+
210+
176211
def create_distribution(
177212
url: str,
178213
cvs: Dict[str, str],
@@ -272,7 +307,7 @@ def create_dataset(
272307
abstract: str,
273308
description: str,
274309
license_url: str,
275-
distributions: List[str],
310+
distributions: Union[List[str], List[Dict]],
276311
attribution: str = None,
277312
derived_from: str = None,
278313
group_title: str = None,
@@ -296,8 +331,10 @@ def create_dataset(
296331
A long description of the dataset. Markdown syntax is supported
297332
license_url: str
298333
The license of the dataset as a URI.
299-
distributions: str
300-
Distribution information string as it is in the CLI. Can be created by running the create_distribution function
334+
distributions: Union[List[str], List[Dict]]
335+
Distribution information. Can be either:
336+
- List[str]: Legacy format with pipe-separated strings (created by create_distribution function)
337+
- List[Dict]: Pre-parsed dictionaries with keys: url, variants, formatExtension, compression
301338
attribution: str
302339
OPTIONAL! The attribution information for the Dataset
303340
derived_from: str
@@ -326,15 +363,28 @@ def create_dataset(
326363
artifact_id = _versionId.rsplit("/", 1)[0]
327364

328365
distribution_list = []
329-
for dst_string in distributions:
330-
__url = str(dst_string).split("|")[0]
331-
(
332-
cvs,
333-
formatExtension,
334-
compression,
335-
sha256sum,
336-
content_length,
337-
) = get_file_info(dst_string)
366+
for dst in distributions:
367+
# Check if distribution is a pre-parsed dict or a legacy string
368+
if isinstance(dst, dict):
369+
# New format: pre-parsed dictionary from parse_distribution_str()
370+
__url = dst.get("url", "")
371+
(
372+
cvs,
373+
formatExtension,
374+
compression,
375+
sha256sum,
376+
content_length,
377+
) = _get_file_info_from_dict(dst)
378+
else:
379+
# Legacy format: pipe-separated string
380+
__url = str(dst).split("|")[0]
381+
(
382+
cvs,
383+
formatExtension,
384+
compression,
385+
sha256sum,
386+
content_length,
387+
) = get_file_info(dst)
338388

339389
if not cvs and len(distributions) > 1:
340390
raise BadArgumentException(

databusclient/api/queries.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""
2+
SPARQL Queries for Databus Python Client
3+
4+
This module contains SPARQL queries used for interacting with the DBpedia Databus.
5+
"""
6+
7+
# Query to fetch ontologies with proper content variant aggregation.
# Uses GROUP_CONCAT to collapse multiple dataid:contentVariant values per
# distribution into one comma-separated ?contentVariants column (parse it
# back with parse_content_variants_string below).
# NOTE: the '#' lines inside the string are SPARQL comments, part of the query.
ONTOLOGIES_QUERY = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX databus: <https://databus.dbpedia.org/>
PREFIX dataid: <http://dataid.dbpedia.org/ns/core#>
PREFIX dataid-cv: <http://dataid.dbpedia.org/ns/cv#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT
?group ?art ?version ?title ?publisher ?comment ?description
?license ?file ?extension ?type ?bytes ?shasum
(GROUP_CONCAT(DISTINCT ?variantStr; separator=", ") AS ?contentVariants)
WHERE {
?dataset dataid:account databus:ontologies .
?dataset dataid:group ?group .
?dataset dataid:artifact ?art.
?dataset dcat:distribution ?distribution .
?dataset dct:license ?license .
?dataset dct:publisher ?publisher .
?dataset rdfs:comment ?comment .
?dataset dct:description ?description .
?dataset dct:title ?title .
?distribution dcat:downloadURL ?file .
?distribution dataid:formatExtension ?extension .
?distribution dataid-cv:type ?type .
?distribution dcat:byteSize ?bytes .
?distribution dataid:sha256sum ?shasum .
?dataset dct:hasVersion ?version .

# Excludes dev versions
FILTER (!regex(?art, "--DEV"))

# OPTIONAL: Check for variants, but don't fail if none exist
OPTIONAL {
?distribution dataid:contentVariant ?cv .
BIND(STR(?cv) AS ?variantStr)
}

}
GROUP BY ?group ?art ?version ?title ?publisher ?comment ?description ?license ?file ?extension ?type ?bytes ?shasum
ORDER BY ?version
"""
52+
53+
54+
def parse_content_variants_string(variants_str: str) -> dict:
    """
    Parse a comma-separated content variants string from SPARQL GROUP_CONCAT result.

    Parameters
    ----------
    variants_str : str
        Comma-separated string of content variants, e.g., "lang=en, type=full, sorted=true"

    Returns
    -------
    dict
        Dictionary of key-value pairs, e.g., {"lang": "en", "type": "full", "sorted": "true"}.
        Standalone tokens without "=" are recorded as boolean True flags.
    """
    parsed = {}
    if variants_str and variants_str.strip():
        for token in (piece.strip() for piece in variants_str.split(",")):
            if "=" in token:
                name, _, raw_value = token.partition("=")
                parsed[name.strip()] = raw_value.strip()
            elif token:
                # Standalone value (no key=value format) becomes a boolean flag
                parsed[token] = True
    return parsed

databusclient/cli.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22
import json
33
import os
4+
import re
45
from typing import List
56

67
import click
@@ -11,6 +12,51 @@
1112
from databusclient.extensions import webdav
1213

1314

15+
def parse_distribution_str(dist_str: str):
    """
    Parses a distribution string with format:
        URL|key=value|...|.extension

    Modifiers after the URL are interpreted as:
      - ".ext"     : a compression suffix if it is a known compression
                     format, otherwise the file format extension
      - "key=value": a content variant
      - anything else is ignored with a warning

    Returns a dictionary suitable for the deploy API:
        {"url", "variants", "formatExtension", "compression"}
    """
    parts = dist_str.split('|')
    url = parts[0].strip()

    variants = {}
    format_ext = None
    compression = None

    # Suffixes that denote a compression layer rather than a data format.
    # (.bz2 and .xz added — previously misclassified as format extensions.)
    compression_exts = {'.gz', '.zip', '.br', '.tar', '.zst', '.bz2', '.xz'}

    # Iterate over the modifiers (everything after the URL)
    for part in parts[1:]:
        part = part.strip()

        # Case 1: Extension (starts with .)
        if part.startswith('.'):
            if part.lower() in compression_exts:
                compression = part.lstrip('.')  # remove leading dot for API compatibility if needed
            else:
                format_ext = part.lstrip('.')

        # Case 2: Content Variant (key=value)
        elif '=' in part:
            key, value = part.split('=', 1)
            variants[key.strip()] = value.strip()

        # Case 3: Standalone tag — not supported, warn and skip
        else:
            print(f"WARNING: Unrecognized modifier '{part}' in distribution. Expected '.ext' or 'key=val'.")

    return {
        "url": url,
        "variants": variants,
        "formatExtension": format_ext,
        "compression": compression
    }
58+
59+
1460
@click.group()
1561
def app():
1662
"""Databus Client CLI"""
@@ -81,9 +127,16 @@ def deploy(
81127
click.echo("[MODE] Classic deploy with distributions")
82128
click.echo(f"Deploying dataset version: {version_id}")
83129

130+
# --- CHANGE START ---
131+
# Parse the input strings into structured objects
132+
parsed_distributions = [parse_distribution_str(d) for d in distributions]
133+
134+
# Note: api_deploy.create_dataset now accepts this list of dicts
84135
dataid = api_deploy.create_dataset(
85-
version_id, title, abstract, description, license_url, distributions
136+
version_id, title, abstract, description, license_url, parsed_distributions
86137
)
138+
# --- CHANGE END ---
139+
87140
api_deploy.deploy(dataid=dataid, api_key=apikey)
88141
return
89142

tests/test_deploy.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,41 @@
1111
_get_content_variants,
1212
BadArgumentException,
1313
)
14+
from databusclient.cli import parse_distribution_str
1415

1516
EXAMPLE_URL = "https://raw.githubusercontent.com/dbpedia/databus/608482875276ef5df00f2360a2f81005e62b58bd/server/app/api/swagger.yml"
1617

1718

19+
def test_parse_distribution_str():
    """Test the new pipe-separated distribution string parser"""

    # Full form: several content variants plus format and compression
    parsed = parse_distribution_str("http://example.com/data|lang=en|type=full|sorted=true|.ttl|.gz")
    assert parsed["url"] == "http://example.com/data"
    assert parsed["variants"] == {"lang": "en", "type": "full", "sorted": "true"}
    assert parsed["formatExtension"] == "ttl"
    assert parsed["compression"] == "gz"

    # Single content variant with a format extension only
    parsed = parse_distribution_str("http://mysite.com/data.json|lang=fr|.json")
    assert parsed["url"] == "http://mysite.com/data.json"
    assert parsed["variants"] == {"lang": "fr"}
    assert parsed["formatExtension"] == "json"
    assert parsed["compression"] is None

    # Bare URL: everything else stays empty/None
    parsed = parse_distribution_str("http://example.com/file.csv")
    assert parsed["url"] == "http://example.com/file.csv"
    assert parsed["variants"] == {}
    assert parsed["formatExtension"] is None
    assert parsed["compression"] is None

    # Compression suffix without any format extension
    parsed = parse_distribution_str("http://example.com/data|.gz")
    assert parsed["url"] == "http://example.com/data"
    assert parsed["compression"] == "gz"
1849
def test_get_content_variants():
1950
# With content variants
2051
cvs = _get_content_variants(

0 commit comments

Comments
 (0)