@@ -155,7 +155,7 @@ def list_sets(opts={})
155155 def do_request ( verb , opts = nil )
156156 # fire off the request and return appropriate DOM object
157157 uri = build_uri ( verb , opts )
158- xml = get ( uri )
158+ xml = strip_invalid_utf_8_chars ( get ( uri ) )
159159 if @parser == 'libxml'
160160 # remove default namespace for oai-pmh since libxml
161161 # isn't able to use our xpaths to get at them
@@ -184,36 +184,20 @@ def encode(value)
184184 end
185185
# Parse a raw response body with the XML parser selected by @parser.
#
# xml - String containing the response document.
#
# Returns the result of XML::Parser#parse when @parser is 'libxml', a
# REXML::Document when @parser is 'rexml', and nil for any other value.
# Raises OAI::Exception when the selected parser rejects the document.
def load_document(xml)
  case @parser
  when 'libxml'
    begin
      parser = XML::Parser.new
      parser.string = xml
      parser.parse
    rescue XML::Parser::ParseError => e
      # Interpolate the message instead of `String + exception`: the latter
      # raises TypeError on Ruby 1.9+ and would hide the real parse error.
      raise OAI::Exception, "response not well formed XML: #{e.message}", caller
    end
  when 'rexml'
    begin
      REXML::Document.new(xml)
    rescue REXML::ParseException => e
      # Same reasoning as above: never concatenate the exception object.
      raise OAI::Exception, "response not well formed XML: #{e.message}", caller
    end
  end
end
@@ -296,5 +280,19 @@ def parse_date(value)
296280 dt . utc
297281 end
298282
283+
# Replace bytes that cannot appear in a well-formed UTF-8 XML document with
# '?', so that a single bad byte in a response does not abort parsing.
#
# Two classes of input are replaced:
# * byte sequences that are not valid UTF-8 (stray continuation bytes,
#   truncated sequences, overlong encodings, encoded surrogates, ...);
# * ASCII control characters other than tab, line feed and carriage
#   return, plus DEL (0x7F).
#
# Valid multi-byte characters, including 4-byte ones, are preserved.
# Background on what counts as valid UTF-8:
# http://www.w3.org/International/questions/qa-forms-utf-8.en.php
#
# Requires Ruby >= 2.1 (String#scrub).
#
# xml - String with the raw response body; it is not modified.
#
# Returns a new UTF-8 encoded String.
def strip_invalid_utf_8_chars(xml)
  # Work on a copy so the caller's string (and its encoding tag) is untouched,
  # and so frozen input is accepted.
  text = xml.dup.force_encoding(Encoding::UTF_8)
  # scrub validates real UTF-8 structure, which also covers the overlong
  # (E0 80..9F xx) and surrogate (ED A0..BF xx) forms.
  text = text.scrub('?')
  # 0x09, 0x0A and 0x0D are the only C0 controls kept.
  text.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/, '?')
end
296+
299297 end
300298end
0 commit comments