|
1 | 1 | #!/usr/bin/python |
2 | 2 |
|
import base64
import json
import logging
import os
import re
import sys
import urllib.error
import urllib.request
import uuid
9 | 10 |
|
# Directory where fetched JSON responses are cached between runs.
CACHEDIR = "cache"
# exist_ok=True is atomic with respect to the existence test, avoiding the
# check-then-create race of the previous os.path.exists() guard.
os.makedirs(CACHEDIR, exist_ok=True)
13 | 14 |
|
def cacheFilename(url):
    """Return a filesystem-safe cache filename derived from *url*.

    The URL is base64-encoded, '=' padding and newlines are stripped, and
    '/' (a character in the standard base64 alphabet) is mapped to '_' so
    the result is safe as a single path component.
    """
    encoded = base64.b64encode(url.encode("utf-8")).decode("utf-8")
    safe = encoded.rstrip('=\n').replace('/', '_')
    return safe + '.json'
17 | 18 |
|
def fetch(url, retry=0):
    """Fetch a JSON resource, reading from / populating a local file cache.

    Args:
        url: URL to fetch. May embed basic-auth credentials in the form
            ``http://user:pass@host/...``; they are stripped from the
            request URL and sent as a Basic Authorization header instead.
        retry: current retry count (internal). HTTP 500 responses are
            retried up to 5 times.

    Returns:
        The parsed JSON object, or None if the request failed.
    """
    cached = os.path.join(CACHEDIR, cacheFilename(url))
    if os.path.exists(cached):
        logging.debug('Getting %s from cache: %s', url, cached)
        with open(cached, encoding='utf-8') as fh:
            data = json.loads(fh.read())
    else:
        request = urllib.request.Request(url)
        if '@' in url:
            # Pull embedded credentials out of the URL and strip them from
            # the request target.
            result = re.search(r"\/\/(.*)@", url)
            url = re.sub(r"\/\/*.*@", r'//', url)
            request = urllib.request.Request(url)
            if result:
                # b64encode requires bytes and returns bytes: encode the
                # credentials first, then decode so the header value is a
                # str, not a bytes repr like "b'...'".
                base64string = base64.b64encode(
                    result.group(1).encode('utf-8')).decode('ascii')
                request.add_header("Authorization", "Basic %s" % base64string)

        try:
            # Context manager closes the connection even if json.loads raises.
            with urllib.request.urlopen(request) as fh:
                data = json.loads(fh.read())

            with open(cached, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

        except urllib.error.HTTPError as error:
            data = None
            # 404 means no annotations for this canvas
            logging.error("Getting %s failed due to %s: %s (Retry: %s)",
                          url, error.code, error.reason, retry)
            if error.code == 500 and retry < 5:
                return fetch(url, retry + 1)
        except urllib.error.URLError as error:
            data = None
            logging.error("Failed to get %s due to %s. Do you have the correct URL for SAS and is it running?", url, error)

    return data
if __name__ == "__main__":
    logging.basicConfig(encoding='utf-8', level=logging.ERROR)

    if len(sys.argv) < 4:
        print("Usage:\n\tdownloadAnnotationListsByCanvas.py [manifest] [sas_endpoint] [output_dir] [optional outputfilename proc]")
        print("Arg no = %s" % len(sys.argv))
        sys.exit(0)

    print("Downloading manifest: {}".format(sys.argv[1]))
    manifest = fetch(sys.argv[1])
    if not manifest:
        print('Failed to load manifest')
        # sys.exit, not the site-module builtin exit(), which is intended
        # only for interactive use and absent under `python -S`.
        sys.exit(1)

    sasEndpoint = sys.argv[2]
    if sasEndpoint.endswith('/'):
        # Normalise: drop a single trailing slash before joining paths.
        sasEndpoint = sasEndpoint[:-1]

    outputDirectory = sys.argv[3]
    # Create the output directory once, up front (previously re-checked and
    # created inside the per-canvas loop).
    os.makedirs(outputDirectory, exist_ok=True)

    count = 0
    for canvas in manifest["sequences"][0]["canvases"]:
        count += 1
        annoListData = fetch("%s/annotation/search?uri=%s" % (sasEndpoint, canvas["@id"]))
        if not annoListData:
            # None (fetch failed) or an empty list (no annotations) — skip.
            continue

        print("Downloaded annotations for canvas: {} ".format(canvas["@id"]))
        # Wrap the raw annotation resources in a sc:AnnotationList envelope.
        annoList = {
            "@type": "sc:AnnotationList",
            "context": "http://iiif.io/api/presentation/2/context.json",
            "resources": annoListData
        }
        if len(sys.argv) > 4 and sys.argv[4] == 'nlw':
            # NLW convention: name the file after the canvas id's last segment.
            filename = canvas["@id"].split('/')[-1]
        else:
            filename = "page%s.json" % count

        outFilename = "%s/%s" % (outputDirectory, filename)
        with open(outFilename, 'w', encoding='utf-8') as outfile:
            json.dump(annoList, outfile, indent=4)
        print('Saved file: {}'.format(outFilename))
0 commit comments