Skip to content

Commit b6f9e03

Browse files
committed
Add support for published files through native API
1 parent 676137c commit b6f9e03

4 files changed

Lines changed: 122 additions & 26 deletions

File tree

dataverse/dataset.py

Lines changed: 83 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from exceptions import (
99
MethodNotAllowedError, NoContainerError, OperationFailedError,
10-
ConnectionError,
10+
ConnectionError, MetadataNotFoundError
1111
)
1212
from file import DataverseFile
1313
from settings import SWORD_BOOTSTRAP
@@ -22,14 +22,16 @@ def __init__(self, entry=SWORD_BOOTSTRAP, dataverse=None, edit_uri=None,
2222
This can be specified in the atom entry or as kwargs
2323
"""
2424
self.dataverse = dataverse
25-
self._statement = None
26-
self._state = None
2725

2826
self.edit_uri = edit_uri
2927
self.edit_media_uri = edit_media_uri
3028
self.statement_uri = statement_uri
3129

3230
self._entry = etree.XML(entry) if isinstance(entry, str) else entry
31+
self._statement = None
32+
self._state = None
33+
self._json = None
34+
self._id = None
3335

3436
# Updates sword entry from keyword arguments
3537
for key, value in kwargs.iteritems():
@@ -72,8 +74,49 @@ def from_dataverse(cls, entry_element, dataverse):
7274

7375
@property
def doi(self):
    """Return the dataset DOI parsed from the SWORD edit-media URI.

    Note: this depends strongly on URL structure, and may break easily.

    :raises NoContainerError: if the dataset has no parent Dataverse.
    """
    if not self.dataverse:
        raise NoContainerError('This dataset has not been added to a Dataverse.')
    # Everything after the last "/study/" segment is the DOI; if the
    # separator is absent, rpartition leaves the whole URI in the tail,
    # matching rsplit("/study/", 1)[-1].
    _, _, doi_suffix = self.edit_media_uri.rpartition("/study/")
    return doi_suffix
82+
83+
@property
def id(self):
    """Return the native-API numeric id of this dataset.

    The id is looked up by matching this dataset's DOI against the parent
    dataverse's contents listing, then cached on the instance.

    :raises NoContainerError: if the dataset has no parent Dataverse.
    :raises MetadataNotFoundError: if no contents entry matches the DOI.
    """
    if self._id:
        return self._id

    if not self.dataverse:
        raise NoContainerError('This dataset has not been added to a Dataverse.')

    for entry in self.dataverse.get_contents(refresh=True):
        # Reassemble the persistent identifier as protocol:authority/identifier.
        entry_doi = '%s:%s/%s' % (
            entry['protocol'], entry['authority'], entry['identifier']
        )
        if entry_doi == self.doi:
            self._id = entry['id']
            return self._id

    raise MetadataNotFoundError('The dataset ID could not be found.')
102+
103+
def get_contents(self, refresh=False):
    """Return the native-API contents listing, caching the response.

    :param refresh: when True, bypass the cache and re-fetch.
    :raises ConnectionError: if the API does not respond with HTTP 200.
    """
    # Bug fix: the visible Dataset.__init__ never initializes
    # self._contents_json, so a plain attribute read would raise
    # AttributeError on the first call; getattr keeps the cache optional.
    if not refresh and getattr(self, '_contents_json', None):
        return self._contents_json

    content_uri = 'https://{0}/api/dataverses/{1}/contents'.format(
        self.connection.host, self.alias
    )
    resp = requests.get(
        content_uri,
        params={'key': self.connection.token}
    )

    if resp.status_code != 200:
        # Bug fix: the old message claimed an Atom entry failed, but this
        # endpoint serves the JSON contents listing.
        raise ConnectionError('Contents could not be retrieved.')

    # NOTE(review): Dataverse.get_contents unwraps resp.json()['data'];
    # this sibling does not — confirm which payload callers expect.
    # NOTE(review): Dataset has no visible `alias` attribute — verify.
    self._contents_json = resp.json()
    return self._contents_json
77120

78121
@property
79122
def citation(self):
@@ -145,21 +188,48 @@ def get_state(self, refresh=False):
145188
).text
146189
return self._state
147190

148-
def get_file(self, file_name, published=False):
149-
files = self.get_files(published)
191+
def get_json(self, refresh=False):
    """Return native-API JSON metadata for the latest published version.

    The response's 'data' payload is cached; pass refresh=True to re-fetch.

    :raises NoContainerError: if the dataset has no parent Dataverse.
    :raises ConnectionError: if the API does not respond with HTTP 200.
    """
    if self._json and not refresh:
        return self._json

    if not self.dataverse:
        raise NoContainerError('This dataset has not been added to a Dataverse.')

    # TODO: Allow specification of other versions
    json_url = 'https://{0}/api/datasets/{1}/versions/:latest-published'.format(
        self.connection.host,
        self.id,
    )
    resp = requests.get(json_url, params={'key': self.connection.token})

    if resp.status_code != 200:
        raise ConnectionError('JSON metadata could not be retrieved.')

    self._json = resp.json()['data']
    return self._json
211+
212+
def get_file(self, file_name, published=False, refresh=True):
    """Return the file named *file_name*, or None if no file matches."""
    for dv_file in self.get_files(published, refresh):
        if dv_file.name == file_name:
            return dv_file
    return None
151215

152-
def get_file_by_id(self, file_id, published=False):
153-
files = self.get_files(published)
216+
def get_file_by_id(self, file_id, published=False, refresh=True):
    """Return the file whose id equals *file_id*, or None if absent."""
    for dv_file in self.get_files(published, refresh):
        if dv_file.id == file_id:
            return dv_file
    return None
155219

156220
def get_files(self, published=False, refresh=True):
    """Return DataverseFile objects for this dataset.

    Published files come from the native API; draft files are read from
    the SWORD statement.
    """
    if published:
        return self.get_published_files(refresh)

    # TODO: Should the native API be preferred?
    statement = self.get_statement(refresh)
    return [
        DataverseFile.from_statement(self, entry)
        for entry in get_elements(statement, 'entry')
    ]
162228

229+
def get_published_files(self, refresh=True):
    """Build DataverseFile objects from the published JSON metadata."""
    files_json = self.get_json(refresh)['files']
    return [DataverseFile.from_json(self, file_json)
            for file_json in files_json]
232+
163233
def add_file(self, filepath):
    """Upload the single file at *filepath* (delegates to add_files)."""
    self.add_files([filepath])
165235

@@ -235,6 +305,7 @@ def delete_all_files(self):
235305
self.delete_file(f)
236306

237307
# TODO: DANGEROUS! Will delete all unspecified fields! Deposit receipts only give SOME of the fields
308+
# Can potentially be replaced with native API functionality
238309
# def update_metadata(self):
239310
# depositReceipt = self.hostDataverse.connection.sword.update(
240311
# dr=self.lastDepositReceipt,

dataverse/dataverse.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from dataset import Dataset
44
from exceptions import (
55
InsufficientMetadataError, MethodNotAllowedError, OperationFailedError,
6+
ConnectionError
67
)
78
from utils import get_element, get_elements, sanitize
89

@@ -11,6 +12,7 @@ class Dataverse(object):
1112
def __init__(self, connection, collection):
    """Wrap a SWORD collection element fetched through *connection*."""
    # Connection object used to authenticate API requests.
    self.connection = connection
    # Atom collection element describing this dataverse.
    self.collection = collection
    # Lazily-populated cache for the native-API contents listing.
    self._contents_json = None
1416

1517
@property
1618
def is_published(self):
@@ -41,6 +43,24 @@ def title(self):
4143
tag='title',
4244
).text)
4345

46+
def get_contents(self, refresh=False):
    """Return the native-API contents listing for this dataverse.

    The 'data' payload of the response is cached on the instance; pass
    refresh=True to bypass the cache and re-fetch.

    :raises ConnectionError: if the API does not respond with HTTP 200.
    """
    if not refresh and self._contents_json:
        return self._contents_json

    content_uri = 'https://{0}/api/dataverses/{1}/contents'.format(
        self.connection.host, self.alias
    )
    resp = requests.get(
        content_uri,
        params={'key': self.connection.token}
    )

    if resp.status_code != 200:
        # Bug fix: the old message claimed an Atom entry failed, but this
        # endpoint serves the JSON contents listing.
        raise ConnectionError('Contents could not be retrieved.')

    self._contents_json = resp.json()['data']
    return self._contents_json
63+
4464
def publish(self):
4565
edit_uri = 'https://{0}/dvn/api/data-deposit/v1.1/swordv2/edit/dataverse/{1}'.format(
4666
self.connection.host, self.alias

dataverse/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,8 @@ class ConnectionError(DataverseError):
3030

3131
class OperationFailedError(DataverseError):
    """Raised when an operation fails for an unknown reason"""
    # Idiom fix: the docstring already forms the class body, so the
    # trailing `pass` was redundant and has been removed.
34+
35+
class MetadataNotFoundError(DataverseError):
    """Raised when metadata cannot be found for an unknown reason"""
    # Idiom fix: the docstring already forms the class body, so the
    # trailing `pass` was redundant and has been removed.

dataverse/file.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,36 @@
55

66

77
class DataverseFile(object):
8-
def __init__(self, dataset, name, file_id=None, edit_media_uri=None):
    """Represent a file belonging to *dataset*.

    Exactly one of *edit_media_uri* (unpublished, SWORD) or *file_id*
    (published, native API) must be provided.

    :param dataset: parent Dataset; its connection supplies the host.
    :param name: file name, sanitized before storage.
    :param file_id: native-API id of a published file.
    :param edit_media_uri: SWORD edit-media URI of an unpublished file.
    :raises InsufficientMetadataError: if neither identifier is given.
    """
    self.dataset = dataset
    self.name = sanitize(name)

    if edit_media_uri:
        self.is_published = False
        self.edit_media_uri = edit_media_uri
        # The id is the second-to-last path segment of the edit-media URI.
        self.id = edit_media_uri.split('/')[-2]
    elif file_id is not None:
        # Robustness fix: `elif file_id:` rejected a falsy id such as 0.
        self.is_published = True
        self.id = file_id
    else:
        raise InsufficientMetadataError(
            'Files must have a file id or edit media uri.'
        )

    # Deduplicated: both branches built this identical URL.
    # NOTE(review): the rest of the package calls the API over https;
    # confirm whether download links should use https as well.
    self.download_url = 'http://{0}/api/access/datafile/{1}'.format(
        dataset.connection.host, self.id
    )
2829

2930
@classmethod
def from_statement(cls, dataset, element):
    """Build an unpublished DataverseFile from a SWORD statement entry."""
    uri = get_element(element, 'content').get('src')
    # The file name is the final path segment of the edit-media URI.
    file_name = uri.rsplit('/', 1)[-1]
    return cls(dataset, file_name, edit_media_uri=uri)
3435

3536
@classmethod
def from_json(cls, dataset, json):
    """Build a published DataverseFile from native-API JSON metadata."""
    datafile = json['datafile']
    return cls(dataset, datafile['name'], datafile['id'])

0 commit comments

Comments
 (0)