Skip to content

Commit 6b2b549

Browse files
committed
Use native ruby logging capabilities for OGM tasks
Closes #138
1 parent 41e9945 commit 6b2b549

6 files changed

Lines changed: 80 additions & 59 deletions

File tree

lib/geo_combine/geo_blacklight_harvester.rb

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# frozen_string_literal: true
22

3+
require 'geo_combine/logger'
4+
35
module GeoCombine
46
##
57
# A class to harvest and index results from GeoBlacklight sites
@@ -45,24 +47,25 @@ def document_transformer
4547

4648
attr_reader :site, :site_key
4749

48-
def initialize(site_key)
50+
def initialize(site_key, logger: GeoCombine::Logger.logger)
4951
@site_key = site_key
5052
@site = self.class.config[site_key]
53+
@logger = logger
5154

5255
raise ArgumentError, "Site key #{@site_key.inspect} is not configured for #{self.class.name}" unless @site
5356
end
5457

5558
def index
56-
puts "Fetching page 1 @ #{base_url}&page=1" if self.class.config[:debug]
59+
@logger.debug "fetching page 1 @ #{base_url}&page=1"
5760
response = JSON.parse(Net::HTTP.get(URI("#{base_url}&page=1")))
5861
response_class = BlacklightResponseVersionFactory.call(response)
5962

60-
response_class.new(response: response, base_url: base_url).documents.each do |docs|
63+
response_class.new(response:, base_url:, logger: @logger).documents.each do |docs|
6164
docs.map! do |document|
6265
self.class.document_transformer&.call(document)
6366
end.compact
6467

65-
puts "Adding #{docs.count} documents to solr" if self.class.config[:debug]
68+
@logger.debug "adding #{docs.count} documents to solr"
6669
solr_connection.update params: { commitWithin: commit_within, overwrite: true },
6770
data: docs.to_json,
6871
headers: { 'Content-Type' => 'application/json' }
@@ -91,10 +94,11 @@ class LegacyBlacklightResponse
9194
attr_reader :base_url
9295
attr_accessor :response, :page
9396

94-
def initialize(response:, base_url:)
97+
def initialize(response:, base_url:, logger: GeoCombine::Logger.logger)
9598
@base_url = base_url
9699
@response = response
97100
@page = 1
101+
@logger = logger
98102
end
99103

100104
def documents
@@ -106,12 +110,12 @@ def documents
106110
break if current_page == total_pages
107111

108112
self.page += 1
109-
puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
113+
@logger.debug "fetching page #{page} @ #{url}"
110114

111115
begin
112116
self.response = JSON.parse(Net::HTTP.get(URI(url)))
113117
rescue StandardError => e
114-
puts "Request for #{url} failed with #{e}"
118+
@logger.error "request for #{url} failed with #{e}"
115119
self.response = nil
116120
end
117121
end
@@ -138,10 +142,11 @@ class ModernBlacklightResponse
138142
attr_reader :base_url
139143
attr_accessor :response, :page
140144

141-
def initialize(response:, base_url:)
145+
def initialize(response:, base_url:, logger: GeoCombine::Logger.logger)
142146
@base_url = base_url
143147
@response = response
144148
@page = 1
149+
@logger = logger
145150
end
146151

147152
def documents
@@ -157,11 +162,11 @@ def documents
157162

158163
url = "#{url}&format=json"
159164
self.page += 1
160-
puts "Fetching page #{page} @ #{url}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
165+
@logger.debug "fetching page #{page} @ #{url}"
161166
begin
162167
self.response = JSON.parse(Net::HTTP.get(URI(url)))
163168
rescue StandardError => e
164-
puts "Request for #{url} failed with #{e}"
169+
@logger.error "Request for #{url} failed with #{e}"
165170
self.response = nil
166171
end
167172
end
@@ -170,11 +175,11 @@ def documents
170175
private
171176

172177
def documents_from_urls(urls)
173-
puts "Fetching #{urls.count} documents for page #{page}" if GeoCombine::GeoBlacklightHarvester.config[:debug]
178+
@logger.debug "fetching #{urls.count} documents for page #{page}"
174179
urls.map do |url|
175180
JSON.parse(Net::HTTP.get(URI("#{url}/raw")))
176181
rescue StandardError => e
177-
puts "Fetching \"#{url}/raw\" failed with #{e}"
182+
@logger.error "fetching \"#{url}/raw\" failed with #{e}"
178183

179184
nil
180185
end.compact

lib/geo_combine/harvester.rb

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
require 'find'
55
require 'git'
66
require 'net/http'
7+
require 'geo_combine/logger'
78

89
module GeoCombine
910
# Harvests Geoblacklight documents from OpenGeoMetadata for indexing
@@ -30,26 +31,37 @@ def self.ogm_api_uri
3031

3132
def initialize(
3233
ogm_path: ENV.fetch('OGM_PATH', 'tmp/opengeometadata'),
33-
schema_version: ENV.fetch('SCHEMA_VERSION', '1.0')
34+
schema_version: ENV.fetch('SCHEMA_VERSION', '1.0'),
35+
logger: GeoCombine::Logger.logger
3436
)
3537
@ogm_path = ogm_path
3638
@schema_version = schema_version
39+
@logger = logger
3740
end
3841

3942
# Enumerable of docs to index, for passing to an indexer
4043
def docs_to_index
4144
return to_enum(:docs_to_index) unless block_given?
4245

46+
@logger.info "loading documents from #{ogm_path}"
4347
Find.find(@ogm_path) do |path|
4448
# skip non-json and layers.json files
45-
next unless File.basename(path).include?('.json') && File.basename(path) != 'layers.json'
49+
if File.basename(path) == 'layers.json' || !File.basename(path).end_with?('.json')
50+
@logger.debug "skipping #{path}; not a geoblacklight JSON document"
51+
next
52+
end
4653

4754
doc = JSON.parse(File.read(path))
4855
[doc].flatten.each do |record|
4956
# skip indexing if this record has a different schema version than what we want
5057
record_schema = record['gbl_mdVersion_s'] || record['geoblacklight_version']
51-
next unless record_schema == @schema_version
58+
record_id = record['layer_slug_s'] || record['dc_identifier_s']
59+
if record_schema != @schema_version
60+
@logger.debug "skipping #{record_id}; schema version #{record_schema} doesn't match #{@schema_version}"
61+
next
62+
end
5263

64+
@logger.debug "found record #{record_id} at #{path}"
5365
yield record, path
5466
end
5567
end
@@ -62,40 +74,44 @@ def pull(repo)
6274
clone(repo) unless File.directory? repo_path
6375

6476
Git.open(repo_path).pull
65-
puts "Updated #{repo}"
66-
1
77+
@logger.info "updated #{repo}"
78+
repo
6779
end
6880

6981
# Update all repositories
70-
# Return the count of repositories updated
82+
# Return the names of repositories updated
7183
def pull_all
72-
repositories.map(&method(:pull)).reduce(:+)
84+
updated = repositories.map(&method(:pull)).compact
85+
@logger.info "updated #{updated.size} repositories"
86+
updated
7387
end
7488

7589
# Clone a repository via git
7690
# If the repository already exists, skip it.
7791
def clone(repo)
7892
repo_path = File.join(@ogm_path, repo)
7993
repo_info = repository_info(repo)
94+
repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"
8095

8196
# Skip if exists; warn if archived or empty
8297
if File.directory? repo_path
83-
puts "Skipping clone to #{repo_path}; directory exists"
84-
return 0
98+
@logger.warn "skipping clone to #{repo_path}; directory exists"
99+
return nil
85100
end
86-
puts "WARNING: repository '#{repo}' is archived" if repo_info['archived']
87-
puts "WARNING: repository '#{repo}' is empty" if repo_info['size'].zero?
101+
@logger.warn "repository is archived: #{repo_url}" if repo_info['archived']
102+
@logger.warn "repository is empty: #{repo_url}" if repo_info['size'].zero?
88103

89-
repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"
90104
Git.clone(repo_url, nil, path: ogm_path, depth: 1)
91-
puts "Cloned #{repo_url}"
92-
1
105+
@logger.info "cloned #{repo_url} to #{repo_path}"
106+
repo
93107
end
94108

95109
# Clone all repositories via git
96-
# Return the count of repositories cloned.
110+
# Return the names of repositories cloned.
97111
def clone_all
98-
repositories.map(&method(:clone)).reduce(:+)
112+
cloned = repositories.map(&method(:clone)).compact
113+
@logger.info "cloned #{cloned.size} repositories"
114+
cloned
99115
end
100116

101117
private

lib/geo_combine/logger.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# frozen_string_literal: true

require 'logger'

module GeoCombine
  # Logger for gem
  #
  # Exposes a single memoized ::Logger shared by the OGM harvesting tasks.
  # Output goes to standard error; the severity threshold is taken from the
  # LOG_LEVEL environment variable (defaulting to "info").
  class Logger
    class << self
      # @return [::Logger] the shared, lazily-built logger instance
      def logger
        @logger ||= build
      end

      private

      # Construct the underlying stdlib logger exactly once.
      def build
        ::Logger.new(
          $stderr,
          progname: 'GeoCombine',
          level: ENV.fetch('LOG_LEVEL', 'info').to_sym
        )
      end
    end
  end
end

lib/tasks/geo_combine.rake

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,20 @@ namespace :geocombine do
1212
desc 'Clone OpenGeoMetadata repositories'
1313
task :clone, [:repo] do |_t, args|
1414
harvester = GeoCombine::Harvester.new
15-
total = args[:repo] ? harvester.clone(args.repo) : harvester.clone_all
16-
puts "Cloned #{total} repositories"
15+
args[:repo] ? harvester.clone(args.repo) : harvester.clone_all
1716
end
1817

1918
desc '"git pull" OpenGeoMetadata repositories'
2019
task :pull, [:repo] do |_t, args|
2120
harvester = GeoCombine::Harvester.new
22-
total = args[:repo] ? harvester.pull(args.repo) : harvester.pull_all
23-
puts "Updated #{total} repositories"
21+
args[:repo] ? harvester.pull(args.repo) : harvester.pull_all
2422
end
2523

2624
desc 'Index all JSON documents except Layers.json'
2725
task :index do
2826
harvester = GeoCombine::Harvester.new
2927
indexer = GeoCombine::Indexer.new
30-
puts "Indexing #{harvester.ogm_path} into #{indexer.solr_url}"
31-
total = indexer.index(harvester.docs_to_index)
32-
puts "Indexed #{total} documents"
28+
indexer.index(harvester.docs_to_index)
3329
end
3430

3531
namespace :geoblacklight_harvester do

spec/lib/geo_combine/geo_blacklight_harvester_spec.rb

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
require 'rsolr'
66

77
RSpec.describe GeoCombine::GeoBlacklightHarvester do
8-
subject(:harvester) { described_class.new(site_key) }
8+
subject(:harvester) { described_class.new(site_key, logger:) }
99

10+
let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
1011
let(:site_key) { :INSTITUTION }
1112
let(:stub_json_response) { '{}' }
1213
let(:stub_solr_connection) { double('RSolr::Connection') }
@@ -40,7 +41,7 @@
4041

4142
let(:docs) { [{ layer_slug_s: 'abc-123' }, { layer_slug_s: 'abc-321' }] }
4243
let(:stub_json_response) do
43-
{ response: { docs: docs, pages: { current_page: 1, total_pages: 1 } } }.to_json
44+
{ response: { docs:, pages: { current_page: 1, total_pages: 1 } } }.to_json
4445
end
4546

4647
it 'adds documents returned to solr' do
@@ -142,7 +143,7 @@
142143
).and_return(stub_second_response.to_json)
143144
base_url = 'https://example.com?f%5Bdct_provenance_s%5D%5B%5D=INSTITUTION&format=json&per_page=100'
144145
docs = described_class::LegacyBlacklightResponse.new(response: stub_first_response,
145-
base_url: base_url).documents
146+
base_url:).documents
146147

147148
expect(docs.to_a).to eq([first_docs, second_docs])
148149
end
@@ -182,7 +183,7 @@
182183

183184
base_url = 'https://example.com?f%5Bdct_provenance_s%5D%5B%5D=INSTITUTION&format=json&per_page=100'
184185
docs = described_class::ModernBlacklightResponse.new(response: first_results_response,
185-
base_url: base_url).documents
186+
base_url:).documents
186187

187188
expect(docs.to_a).to eq([
188189
[{ 'layer_slug_s' => 'abc-123' }, { 'layer_slug_s' => 'abc-321' }],

spec/lib/geo_combine/harvester_spec.rb

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
require 'spec_helper'
66

77
RSpec.describe GeoCombine::Harvester do
8-
subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing') }
8+
subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing', logger:) }
99

10+
let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
1011
let(:repo_name) { 'my-institution' }
1112
let(:repo_path) { File.join(harvester.ogm_path, repo_name) }
1213
let(:repo_url) { "https://github.com/OpenGeoMetadata/#{repo_name}.git" }
@@ -47,7 +48,7 @@
4748
end
4849

4950
it 'skips records with a different schema version' do
50-
harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: 'Aardvark')
51+
harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: 'Aardvark', logger:)
5152
expect { |b| harvester.docs_to_index(&b) }.to yield_successive_args(
5253
[JSON.parse(File.read('spec/fixtures/indexing/aardvark.json')), 'spec/fixtures/indexing/aardvark.json']
5354
)
@@ -74,8 +75,8 @@
7475
expect(stub_repo).to have_received(:pull).exactly(2).times
7576
end
7677

77-
it 'returns the count of repositories pulled' do
78-
expect(harvester.pull_all).to eq(2)
78+
it 'returns the names of repositories pulled' do
79+
expect(harvester.pull_all).to eq(%w[my-institution another-institution])
7980
end
8081

8182
it 'skips repositories in the denylist' do
@@ -106,20 +107,6 @@
106107
harvester.clone(repo_name)
107108
expect(Git).not_to have_received(:clone)
108109
end
109-
110-
it 'warns if a repository is empty' do
111-
allow(Net::HTTP).to receive(:get).with('https://api.github.com/repos/opengeometadata/empty').and_return('{"size": 0}')
112-
expect do
113-
harvester.clone('empty')
114-
end.to output(/repository 'empty' is empty/).to_stdout
115-
end
116-
117-
it 'warns if a repository is archived' do
118-
allow(Net::HTTP).to receive(:get).with('https://api.github.com/repos/opengeometadata/empty').and_return('{"archived": true}')
119-
expect do
120-
harvester.clone('outdated-institution')
121-
end.to output(/repository 'outdated-institution' is archived/).to_stdout
122-
end
123110
end
124111

125112
describe '#clone_all' do
@@ -133,8 +120,8 @@
133120
expect(Git).not_to have_received(:clone).with('https://github.com/OpenGeoMetadata/aardvark.git')
134121
end
135122

136-
it 'returns the count of repositories cloned' do
137-
expect(harvester.clone_all).to eq(2)
123+
it 'returns the names of repositories cloned' do
124+
expect(harvester.clone_all).to eq(%w[my-institution another-institution])
138125
end
139126
end
140127

0 commit comments

Comments
 (0)