Skip to content

Commit ed1981a

Browse files
committed
Try to escape invalid UTF-8 characters during harvesting.
1 parent 4248efc commit ed1981a

6 files changed

Lines changed: 33 additions & 22 deletions

File tree

lib/oai/client.rb

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def list_sets(opts={})
155155
def do_request(verb, opts = nil)
156156
# fire off the request and return appropriate DOM object
157157
uri = build_uri(verb, opts)
158-
xml = get(uri)
158+
xml = strip_invalid_utf_8_chars(get(uri))
159159
if @parser == 'libxml'
160160
# remove default namespace for oai-pmh since libxml
161161
# isn't able to use our xpaths to get at them
@@ -184,36 +184,20 @@ def encode(value)
184184
end
185185

186186
def load_document(xml)
187-
retried = false
188187
case @parser
189188
when 'libxml'
190189
begin
191190
parser = XML::Parser.new()
192191
parser.string = xml
193192
return parser.parse
194193
rescue XML::Parser::ParseError => e
195-
if retried
196-
raise OAI::Exception, 'response not well formed XML: '+e, caller
197-
end
198-
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
199-
xml2 = ic.iconv(xml << ' ')[0..-2]
200-
puts "equal? #{xml == xml2}"
201-
retried = true
202-
retry
194+
raise OAI::Exception, 'response not well formed XML: '+e, caller
203195
end
204196
when 'rexml'
205197
begin
206198
return REXML::Document.new(xml)
207199
rescue REXML::ParseException => e
208-
if retried
209-
puts xml
210-
raise OAI::Exception, 'response not well formed XML: '+e, caller
211-
end
212-
puts "RETRYING"
213-
ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
214-
xml = ic.iconv(xml << ' ')[0..-2]
215-
retried = true
216-
retry
200+
raise OAI::Exception, 'response not well formed XML: '+e, caller
217201
end
218202
end
219203
end
@@ -296,5 +280,19 @@ def parse_date(value)
296280
dt.utc
297281
end
298282

283+
284+
# Replace bytes that are not acceptable UTF-8 text with '?'.
#
# Kept unchanged: tab, LF, CR, printable ASCII, and well-formed 2- and
# 3-byte UTF-8 sequences (no overlong forms, no UTF-16 surrogates),
# following the W3C "valid UTF-8" pattern:
# http://www.w3.org/International/questions/qa-forms-utf-8.en.php
#
# Replaced by a single '?': every other control character, every run of
# stray continuation bytes, and every invalid or truncated lead byte
# together with the continuation bytes that directly follow it.
#
# NOTE(review): sequences starting with 0xF0-0xFF (this includes well-formed
# 4-byte characters) are replaced as well, exactly as the previous pattern
# did and as test_escaping_invalid_utf_8_characters expects -- confirm that
# this is still wanted.
#
# xml - String with the raw response body; it is not modified.
#
# Returns a new String (tagged as UTF-8 where String encodings exist).
def strip_invalid_utf_8_chars(xml)
  # Work on a byte-oriented copy so the pattern can inspect single bytes
  # and the caller's string keeps its content and encoding.
  bytes = xml.dup
  bytes.force_encoding('BINARY') if bytes.respond_to?(:force_encoding)

  # Group 1 captures text that is kept; the remaining alternatives are, in
  # order: a bad lead byte plus trailing continuation bytes, a run of stray
  # continuation bytes, and a single disallowed control character.
  # The n flag keeps the pattern byte-based; x allows the layout below.
  cleaned = bytes.gsub(/
      ( [\x09\x0A\x0D\x20-\x7E]+
      | [\xC2-\xDF][\x80-\xBF]
      | \xE0[\xA0-\xBF][\x80-\xBF]
      | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
      | \xED[\x80-\x9F][\x80-\xBF]
      )
    | [\xC0-\xFF][\x80-\xBF]*
    | [\x80-\xBF]+
    | [\x00-\x08\x0B\x0C\x0E-\x1F\x7F]
  /xn) { Regexp.last_match(1) || '?' }

  # Everything left is well-formed UTF-8, so label it accordingly.
  cleaned.force_encoding('UTF-8') if cleaned.respond_to?(:force_encoding)
  cleaned
end
296+
299297
end
300298
end

lib/oai/harvester.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
require 'chronic'
1010
require 'socket'
1111

12+
require 'oai/client'
1213
require 'oai/harvester/config'
1314
require 'oai/harvester/harvest'
1415
require 'oai/harvester/logging'

lib/oai/harvester/harvest.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def call(url, opts)
107107
end
108108

109109
def get_records(doc)
110-
doc.find("/OAI-PMH/ListRecords/record").to_a
110+
doc.doc.root.elements.to_a("/OAI-PMH/ListRecords/record")
111111
end
112112

113113
def build_options_hash(site)

lib/oai/harvester/shell.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def start
4747
end
4848
rescue
4949
puts "Not a recognized command, or bad options. Type 'help' for clues."
50-
#puts $!
51-
#puts $!.backtrace.join("\n")
50+
puts $!
51+
puts $!.backtrace.join("\n")
5252
end
5353
end
5454
end

test/client/tc_list_records.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
require 'test_helper'
22

33
class GetRecordsTest < Test::Unit::TestCase
4+
45
def test_get_records
56
client = OAI::Client.new 'http://localhost:3333/oai'
67
response = client.list_records

test/client/tc_utf8_escaping.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
require 'test_helper'
2+
3+
# Checks that OAI::Client#strip_invalid_utf_8_chars removes or replaces
# unwanted characters from a string before it is parsed.
class UTF8Test < Test::Unit::TestCase
4+
5+
# Builds a string from code points via pack("U*"): control characters
# (2-6), two values above 0xFFFF (66897, 66535), one just past the Unicode
# maximum (1114112 == 0x110000) and one far beyond it (55234123), mixed in
# with the letters of "hello!!". After sanitizing and dropping every '?'
# placeholder, only "hello!!" must remain.
def test_escaping_invalid_utf_8_characters
6+
client = OAI::Client.new 'http://localhost:3333/oai', :parser => 'libxml'
7+
invalid_utf_8 = [2, 3, 4, 104, 5, 101, 6, 108, 66897, 108, 66535, 111, 1114112, 33, 55234123, 33].pack("U*")
8+
# The method is called through send -- presumably it is not public;
# its visibility is not shown here.
assert_equal("hello!!", client.send(:strip_invalid_utf_8_chars, invalid_utf_8).gsub(/\?/, ''))
9+
end
10+
11+
end

0 commit comments

Comments
 (0)