Skip to content

Commit f924d56

Browse files
Marvin Hofer authored and JJ-Author committed
init download function merge
1 parent 45cc36d commit f924d56

4 files changed

Lines changed: 129 additions & 2 deletions

File tree

databusclient/cli.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,9 @@ def deploy(
3535

3636

3737
@app.command()
38-
def download(collection: str):
39-
typer.echo("TODO")
38+
def download(
    localDir: str = typer.Option(..., help="local databus folder"),
    databus: str = typer.Option(..., help="databus URL"),
    databusURIs: List[str] = typer.Argument(...,help="any kind of these: databus identifier, databus collection identifier, query file")
):
    # CLI entry point: hands the parsed options straight to the library
    # function; the ``databus`` option is what client.download calls
    # ``endpoint``. Arguments are passed in the order of its signature
    # (localDir, endpoint, databusURIs).
    client.download(localDir, databus, databusURIs)

databusclient/client.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import requests
44
import hashlib
55
import json
6+
from tqdm import tqdm
7+
from SPARQLWrapper import SPARQLWrapper, JSON
8+
from hashlib import sha256
69

710
__debug = False
811

@@ -386,3 +389,100 @@ def deploy(
386389
if debug or __debug:
387390
print("---------")
388391
print(resp.text)
392+
393+
394+
def __download_file__(url, filename):
    """
    Download a file from the internet with a progress bar using tqdm.

    Parameters:
    - url: the URL of the file to download
    - filename: the local file path where the file should be saved

    Raises:
    - requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    print("download "+url)
    block_size = 1024  # 1 Kibibyte

    # The response is used as a context manager so the streamed connection
    # is released even when writing to disk fails half-way.
    with requests.get(url, stream=True) as response:
        # Fail early instead of saving an HTTP error page as the dataset.
        response.raise_for_status()
        # A missing Content-Length header yields 0, i.e. "size unknown".
        total_size_in_bytes = int(response.headers.get('content-length', 0))

        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
        try:
            with open(filename, 'wb') as file:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
        finally:
            # Always restore the terminal state, also on errors.
            progress_bar.close()

    # Only verifiable when the server announced a size up front.
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")
415+
416+
417+
def __query_sparql__(endpoint_url, query)-> dict:
    """
    Send a SPARQL query to an endpoint via POST and return the converted
    JSON response.

    Parameters:
    - endpoint_url: the URL of the SPARQL endpoint
    - query: the SPARQL query string

    Returns:
    - Dictionary containing the query results
    """
    wrapper = SPARQLWrapper(endpoint_url)
    wrapper.setReturnFormat(JSON)
    wrapper.setQuery(query)
    wrapper.method = 'POST'
    return wrapper.query().convert()
434+
435+
436+
def __handle__databus_file_query__(endpoint_url, query) -> List[str]:
    """
    Run a SPARQL query and yield the value of every single-variable result row.

    Note that this is a generator: the query is only sent once the result
    is iterated, and values are produced one at a time.

    Parameters:
    - endpoint_url: the URL of the SPARQL endpoint
    - query: a SPARQL query that selects exactly one variable per row

    Yields:
    - the 'value' field of the single binding in each result row

    Iteration stops (after printing an error) at the first row that binds
    more than one variable. Rows without any binding are skipped.
    """
    result_dict = __query_sparql__(endpoint_url, query)
    for binding in result_dict['results']['bindings']:
        if len(binding) > 1:
            print("Error multiple bindings in query response")
            break
        if not binding:
            # An empty row has no value to yield. Looking one up with
            # next(iter(...)) would raise StopIteration, which inside a
            # generator is turned into a RuntimeError (PEP 479).
            continue
        (cell,) = binding.values()
        yield cell['value']
445+
446+
447+
def wsha256(raw: str):
    """Return the hexadecimal SHA-256 digest of *raw* encoded as UTF-8."""
    digest = sha256()
    digest.update(raw.encode('utf-8'))
    return digest.hexdigest()
449+
450+
451+
def __handle_databus_collection__(endpoint, uri: str)-> str:
    """
    Fetch the SPARQL query behind a databus collection.

    The collection URI is requested with ``Accept: text/sparql`` and the
    response body is returned as text.

    Parameters:
    - endpoint: currently unused; kept so existing callers keep working
    - uri: the databus collection URI

    Returns:
    - the response body (expected to be a SPARQL query string)

    Raises:
    - requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    headers = {"Accept": "text/sparql"}
    response = requests.get(uri, headers=headers)
    # Without this check an HTTP error page would be passed on as a query.
    response.raise_for_status()
    return response.text
454+
455+
456+
def __download_list__(urls: List[str], localDir: str):
    """
    Download every URL into ``localDir``.

    Each file is stored under the SHA-256 hex digest of its URL, so the
    same URL always maps to the same local file name.

    Parameters:
    - urls: iterable of download URLs
    - localDir: target directory; created if it does not exist yet
    """
    import os  # local import: the module's import block is not visible here

    # open(..., 'wb') does not create missing parent directories.
    if localDir:
        os.makedirs(localDir, exist_ok=True)
    for url in urls:
        __download_file__(url=url, filename=os.path.join(localDir, wsha256(url)))
459+
460+
461+
def download(
    localDir: str,
    endpoint: str,
    databusURIs: List[str]
) -> None:
    """
    Download datasets to local storage from databus registry
    ------
    localDir: the local directory
    endpoint: the SPARQL endpoint URL the file queries are sent to
    databusURIs: identifiers to access databus registered datasets

    Each entry of ``databusURIs`` is handled by its shape:
    - http(s) URI containing "/collections/": the collection's query is
      fetched and its result files are downloaded
    - any other http(s) URI (dataId): not supported yet, skipped
    - "file://" URI (query in a local file): not supported yet, skipped
    - anything else: treated as a SPARQL query string
    """
    for databusURI in databusURIs:
        # dataID or databus collection
        if databusURI.startswith(("http://", "https://")):
            if "/collections/" not in databusURI:
                print("dataId not supported yet")
                # Nothing to download for this entry; skipping avoids
                # using an unbound or stale result from another entry.
                continue
            # databus collection
            query = __handle_databus_collection__(endpoint, databusURI)
        # query in local file
        elif databusURI.startswith("file://"):
            print("query in file not supported yet")
            continue
        # query as argument
        else:
            query = databusURI
            print("QUERY {}".format(query.replace("\n", " ")))
        res = __handle__databus_file_query__(endpoint, query)
        __download_list__(res, localDir)

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ readme = "README.md"
1010
python = "^3.9"
1111
typer = "^0.6.1"
1212
requests = "^2.28.1"
13+
tqdm = "^2.2.3"
14+
SPARQLWrapper = "^2.0.0"
15+
1316

1417
[tool.poetry.dev-dependencies]
1518
black = "^22.6.0"

tests/test_download.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""Download Tests"""
2+
import pytest
3+
import databusclient.client as cl
4+
5+
# SPARQL endpoint the download tests are run against.
DEFAULT_ENDPOINT = "https://databus.dbpedia.org/sparql"

# Query selecting ten download URLs; used as a "query as argument" input.
TEST_QUERY = """
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?x WHERE {
?sub dcat:downloadURL ?x .
} LIMIT 10
"""

# Collection URI; used as a "databus collection" input.
TEST_COLLECTION = "https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12"
13+
14+
def test_with_query():
    """Download the files selected by a SPARQL query passed as an argument."""
    cl.download(
        localDir="target",
        endpoint=DEFAULT_ENDPOINT,
        databusURIs=[TEST_QUERY],
    )
18+
19+
def test_with_collection():
    """Download the files of a databus collection given by its URI."""
    cl.download(
        localDir="target",
        endpoint=DEFAULT_ENDPOINT,
        databusURIs=[TEST_COLLECTION],
    )

0 commit comments

Comments
 (0)