Skip to content

Commit 41e9945

Browse files
committed
Make the opengeometadata indexer more robust
- Gracefully handle indexing failures. - Use Blacklight's solr connection if available. Closes #168; closes #166.
1 parent da67ae7 commit 41e9945

3 files changed

Lines changed: 197 additions & 46 deletions

File tree

geo_combine.gemspec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
2626
spec.add_dependency 'thor'
2727
spec.add_dependency 'faraday-net_http_persistent', '~> 2.0'
2828
spec.add_dependency 'git'
29+
spec.add_dependency 'faraday-retry', '~> 2.2'
2930

3031
spec.add_development_dependency 'bundler'
3132
spec.add_development_dependency 'rake'

lib/geo_combine/indexer.rb

Lines changed: 104 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,126 @@
11
# frozen_string_literal: true
22

33
require 'rsolr'
4+
require 'faraday/retry'
45
require 'faraday/net_http_persistent'
6+
require 'geo_combine/logger'
57

68
module GeoCombine
79
# Indexes Geoblacklight documents into Solr
810
class Indexer
911
attr_reader :solr
1012

11-
def self.solr(url: ENV.fetch('SOLR_URL', 'http://127.0.0.1:8983/solr/blacklight-core'))
12-
RSolr.connect url: url, adapter: :net_http_persistent
13+
def initialize(solr: nil, logger: GeoCombine::Logger.logger)
14+
@logger = logger
15+
@batch_size = ENV.fetch('SOLR_BATCH_SIZE', 100).to_i
16+
17+
# If SOLR_URL is set, use it; if in a Geoblacklight app, use its solr core
18+
solr_url = ENV.fetch('SOLR_URL', nil)
19+
solr_url ||= Blacklight.default_index.connection.base_uri.to_s if defined? Blacklight
20+
21+
# If neither, warn and try to use local Blacklight default solr core
22+
if solr_url.nil?
23+
@logger.warn 'SOLR_URL not set; using Blacklight default'
24+
solr_url = 'http://localhost:8983/solr/blacklight-core'
25+
end
26+
27+
@solr = solr || RSolr.connect(client, url: solr_url)
1328
end
1429

15-
def initialize(solr: GeoCombine::Indexer.solr)
16-
@solr = solr
30+
# Index everything and return the number of docs successfully indexed
31+
def index(docs)
32+
# Track total indexed and time spent
33+
@logger.info "indexing into #{solr_url}"
34+
total_indexed = 0
35+
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
36+
37+
# Index in batches; set batch size via BATCH_SIZE
38+
batch = []
39+
docs.each do |doc, path|
40+
if batch.size < @batch_size
41+
batch << [doc, path]
42+
else
43+
total_indexed += index_batch(batch)
44+
batch = []
45+
end
46+
end
47+
total_indexed += index_batch(batch) unless batch.empty?
48+
49+
# Issue a commit to make sure all documents are indexed
50+
@solr.commit
51+
end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
52+
sec = end_time - start_time
53+
@logger.info format('indexed %<total_indexed>d documents in %<sec>.2f seconds', total_indexed:, sec:)
54+
total_indexed
1755
end
1856

57+
# URL to the solr instance being used
1958
def solr_url
2059
@solr.options[:url]
2160
end
2261

23-
# Index everything and return the number of docs successfully indexed
24-
def index(docs, commit_within: ENV.fetch('SOLR_COMMIT_WITHIN', 5000).to_i)
25-
indexed_count = 0
26-
27-
docs.each do |record, path|
28-
# log the unique identifier for the record for debugging
29-
id = record['id'] || record['dc_identifier_s']
30-
puts "Indexing #{id}: #{path}" if $DEBUG
31-
32-
# index the record into solr
33-
@solr.update params: { commitWithin: commit_within, overwrite: true },
34-
data: [record].to_json,
35-
headers: { 'Content-Type' => 'application/json' }
36-
37-
# count the number of records successfully indexed
38-
indexed_count += 1
39-
rescue RSolr::Error::Http => e
40-
puts e
41-
end
62+
private
4263

43-
@solr.commit
44-
indexed_count
64+
# Index a batch of documents; if we fail, index them all individually
65+
def index_batch(batch)
66+
docs = batch.map(&:first)
67+
@solr.update(data: batch_json(docs), params:, headers:)
68+
@logger.debug "indexed batch (#{batch.size} docs)"
69+
batch.size
70+
rescue RSolr::Error::Http => e
71+
@logger.error "error indexing batch (#{batch.size} docs): #{format_error(e)}"
72+
@logger.warn 'retrying documents individually'
73+
batch.map { |doc, path| index_single(doc, path) }.compact.size
74+
end
75+
76+
# Index a single document; if it fails, log the error and continue
77+
def index_single(doc, path)
78+
@solr.add(doc, params:, headers:)
79+
@logger.debug "indexed #{path}"
80+
doc
81+
rescue RSolr::Error::Http => e
82+
@logger.error "error indexing #{path}: #{format_error(e)}"
83+
nil
84+
end
85+
86+
# Generate a JSON string to send to solr update API for a batch of documents
87+
def batch_json(batch)
88+
batch.map { |doc| "add: { doc: #{doc.to_json} }" }.join(",\n").prepend('{ ').concat(' }')
89+
end
90+
91+
# Generate a friendly error message for logging including status code and message
92+
def format_error(error)
93+
code = error.response[:status]
94+
status_info = "#{code} #{RSolr::Error::Http::STATUS_CODES[code.to_i]}"
95+
error_info = parse_solr_error(error)
96+
[status_info, error_info].compact.join(' - ')
97+
end
98+
99+
# Extract the specific error message from a solr JSON error response, if any
100+
def parse_solr_error(error)
101+
JSON.parse(error.response[:body]).dig('error', 'msg')
102+
rescue StandardError
103+
nil
104+
end
105+
106+
def headers
107+
{ 'Content-Type' => 'application/json' }
108+
end
109+
110+
def params
111+
{ overwrite: true }
112+
end
113+
114+
def client
115+
@client ||= Faraday.new do |conn|
116+
conn.request :retry, max: 3, interval: 1, backoff_factor: 2, exceptions: [
117+
Faraday::TimeoutError,
118+
Faraday::ConnectionFailed,
119+
Faraday::TooManyRequestsError
120+
]
121+
conn.response :raise_error
122+
conn.adapter :net_http_persistent
123+
end
45124
end
46125
end
47126
end

spec/lib/geo_combine/indexer_spec.rb

Lines changed: 92 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,22 @@
33
require 'geo_combine/indexer'
44
require 'spec_helper'
55

6+
# Mock an available Blacklight installation
7+
class FakeBlacklight
8+
def self.default_index
9+
Repository
10+
end
11+
12+
class Repository
13+
def self.connection; end
14+
end
15+
end
16+
617
RSpec.describe GeoCombine::Indexer do
7-
subject(:indexer) { described_class.new(solr: solr) }
18+
subject(:indexer) { described_class.new(solr:, logger:) }
819

9-
let(:solr) { instance_double(RSolr::Client) }
20+
let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
21+
let(:solr) { instance_double(RSolr::Client, options: { url: 'TEST' }) }
1022
let(:docs) do
1123
[
1224
[{ 'id' => '1' }, 'path/to/record1.json'], # v1.0 schema
@@ -21,36 +33,69 @@
2133

2234
describe '#initialize' do
2335
before do
24-
stub_const('ENV', 'SOLR_URL' => 'http://localhost:8983/solr/geoblacklight')
2536
allow(RSolr).to receive(:connect).and_return(solr)
2637
end
2738

28-
it 'connects to a solr instance if set in the environment' do
29-
described_class.new
30-
expect(RSolr).to have_received(:connect).with(
31-
url: 'http://localhost:8983/solr/geoblacklight',
32-
adapter: :net_http_persistent
33-
)
39+
context 'when solr url is set in the environment' do
40+
before do
41+
stub_const('ENV', 'SOLR_URL' => 'http://localhost:8983/solr/geoblacklight')
42+
end
43+
44+
it 'connects to the solr instance' do
45+
described_class.new(logger:)
46+
expect(RSolr).to have_received(:connect).with(
47+
be_a(Faraday::Connection),
48+
url: 'http://localhost:8983/solr/geoblacklight'
49+
)
50+
end
51+
end
52+
53+
context 'when there is a configured Blacklight connection' do
54+
before do
55+
stub_const('Blacklight', FakeBlacklight)
56+
allow(FakeBlacklight::Repository).to receive(:connection).and_return(
57+
instance_double(RSolr::Client, base_uri: URI('http://localhost:8983/solr/geoblacklight'))
58+
)
59+
end
60+
61+
it 'connects to the solr instance' do
62+
described_class.new(logger:)
63+
expect(RSolr).to have_received(:connect).with(
64+
be_a(Faraday::Connection),
65+
url: 'http://localhost:8983/solr/geoblacklight'
66+
)
67+
end
68+
end
69+
70+
context 'when solr url is not set' do
71+
before do
72+
stub_const('ENV', {})
73+
end
74+
75+
it 'falls back to the Blacklight default' do
76+
described_class.new(logger:)
77+
expect(RSolr).to have_received(:connect).with(
78+
be_a(Faraday::Connection),
79+
url: 'http://localhost:8983/solr/blacklight-core'
80+
)
81+
end
3482
end
3583
end
3684

3785
describe '#index' do
38-
it 'posts each record to solr as JSON' do
39-
indexer.index([docs[0]], commit_within: 1)
86+
let(:solr_error_msg) { { error: { msg: 'error message' } }.to_json }
87+
let(:solr_response) { { status: '400', body: solr_error_msg } }
88+
let(:error) { RSolr::Error::Http.new({ uri: URI('') }, solr_response) }
89+
90+
it 'sends records in batches to solr' do
91+
indexer.index(docs)
4092
expect(solr).to have_received(:update).with(
41-
params: { commitWithin: 1, overwrite: true },
42-
data: [docs[0][0]].to_json,
43-
headers: { 'Content-Type' => 'application/json' }
93+
data: "{ add: { doc: {\"id\":\"1\"} },\nadd: { doc: {\"dc_identifier_s\":\"2\"} } }",
94+
headers: { 'Content-Type' => 'application/json' },
95+
params: { overwrite: true }
4496
)
4597
end
4698

47-
it 'prints the id and path of each record in debug mode' do
48-
$DEBUG = true
49-
expect { indexer.index([docs[0]]) }.to output("Indexing 1: path/to/record1.json\n").to_stdout
50-
expect { indexer.index([docs[1]]) }.to output("Indexing 2: path/to/record2.json\n").to_stdout
51-
$DEBUG = false
52-
end
53-
5499
it 'commits changes to solr after indexing' do
55100
indexer.index(docs)
56101
expect(solr).to have_received(:commit).once
@@ -59,5 +104,31 @@
59104
it 'returns the count of records successfully indexed' do
60105
expect(indexer.index(docs)).to eq 2
61106
end
107+
108+
context 'when an error occurs during batch indexing' do
109+
before do
110+
allow(solr).to receive(:update).and_raise(error)
111+
allow(solr).to receive(:add)
112+
end
113+
114+
it 'attempts to index records individually' do
115+
total = indexer.index(docs)
116+
expect(solr).to have_received(:add).twice
117+
expect(total).to eq 2
118+
end
119+
end
120+
121+
context 'when an error occurs during individual indexing' do
122+
before do
123+
allow(solr).to receive(:update).and_raise(error)
124+
allow(solr).to receive(:add).with(docs[0][0], anything).and_raise(error)
125+
allow(solr).to receive(:add).with(docs[1][0], anything)
126+
end
127+
128+
it 'continues indexing' do
129+
total = indexer.index(docs)
130+
expect(total).to eq 1
131+
end
132+
end
62133
end
63134
end

0 commit comments

Comments
 (0)