Skip to content

Commit 6d0b861

Browse files
committed
Move Solrmarc conversion into custom Solr backend
As we're working on additional indexes fed by Solrmarc (for Blacklight), it started making more sense to move the Solrmarc conversion process into a lower-level class so that we can have more uniformity at the index and exporter levels. This makes that change. It also means we can remove some cruft from the solr/solrmarc project directory.
1 parent c401ad4 commit 6d0b861

6 files changed

Lines changed: 297 additions & 67 deletions

File tree

django/sierra/base/search_indexes.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from haystack import indexes, constants, utils, exceptions
1616

17+
from django.conf import settings
1718
from django.core.exceptions import ObjectDoesNotExist
1819

1920
from export import sierra2marc as s2m
@@ -200,7 +201,20 @@ def full_prepare(self, obj):
200201
return self.prepared_data
201202

202203

203-
class BibIndex(CustomQuerySetIndex, indexes.Indexable):
204+
class SolrmarcIndex(CustomQuerySetIndex):
    """
    A `CustomQuerySetIndex` that carries the class attributes read by
    the SolrmarcIndexBackend when it indexes data via Solrmarc. Refer
    to the `sierra.solr_backend.SolrmarcIndexBackend` docstring for
    details about what each attribute controls.
    """

    # These three are left as None here; subclasses may override them.
    temp_filedir = None
    config_file = None
    index_properties = None

    # Class used to batch-convert records to MARC.
    s2marc_class = s2m.S2MarcBatch
215+
216+
217+
class BibIndex(SolrmarcIndex, indexes.Indexable):
204218
"""
205219
WARNING: This is a total hack to force Haystack to register our
206220
BibRecord model as being indexed by Haystack. This is the only way

django/sierra/export/basic_exporters.py

Lines changed: 30 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,13 @@
88
"""
99
from __future__ import unicode_literals
1010
import logging
11-
import re
12-
import subprocess
13-
import os
14-
from collections import OrderedDict
1511

1612
from django.conf import settings
1713

1814
from base import models as sierra_models
1915
from base import search_indexes as indexes
20-
from export import models as export_models
2116
from export.exporter import (Exporter, ToSolrExporter, MetadataToSolrExporter,
2217
CompoundMixin, AttachedRecordExporter)
23-
from export.sierra2marc import S2MarcError, S2MarcBatch
2418
from utils import helpers, redisobjs, solr
2519

2620
# set up logger, for debugging
@@ -350,7 +344,13 @@ def final_callback(self, vals=None, status='success'):
350344

351345
class BibsDownloadMarc(Exporter):
352346
"""
353-
Defines processes that convert Sierra bib records to MARC.
347+
This exporter is now deprecated--please do not use.
348+
349+
Previously this defined processes that convert Sierra bib records
350+
to MARC, but now that is handled through a custom Solr backend for
351+
Haystack.
352+
353+
`BibsDownloadMarc` will be removed in version 1.5.
354354
"""
355355
max_rec_chunk = 2000
356356
parallel = False
@@ -369,6 +369,15 @@ class BibsDownloadMarc(Exporter):
369369
]
370370
select_related = ['record_metadata', 'record_metadata__record_type']
371371

372+
def _warn(self):
373+
msg = ('The `BibsDownloadMarc` exporter is deprecated and will be '
374+
'removed in version 1.5.')
375+
self.log('Warning', msg)
376+
377+
def __init__(self, *args, **kwargs):
    """
    Run the parent class's initializer with the given arguments, then
    log the deprecation warning.
    """
    parent = super(BibsDownloadMarc, self)
    parent.__init__(*args, **kwargs)
    self._warn()
380+
372381
def export_records(self, records):
373382
batch = S2MarcBatch(records)
374383
out_recs = batch.to_marc()
@@ -382,17 +391,15 @@ def export_records(self, records):
382391
return { 'marcfile': filename }
383392

384393

385-
class BibsToSolr(CompoundMixin, ToSolrExporter):
394+
class BibsToSolr(ToSolrExporter):
386395
"""
387396
Defines processes that export Sierra/MARC bibs out to Solr.
388397
"""
389398
Index = ToSolrExporter.Index
390-
Child = CompoundMixin.Child
391399
index_config = (
392400
Index('Bibs', indexes.BibIndex, SOLR_CONNS['BibsToSolr:BIBS']),
393401
Index('MARC', indexes.MarcIndex, SOLR_CONNS['BibsToSolr:MARC'])
394402
)
395-
children_config = (Child('BibsDownloadMarc'),)
396403
model = sierra_models.BibRecord
397404
deletion_filter = [
398405
{
@@ -401,57 +408,19 @@ class BibsToSolr(CompoundMixin, ToSolrExporter):
401408
}
402409
]
403410
max_rec_chunk = 2000
404-
405-
@property
406-
def prefetch_related(self):
407-
return self.children['BibsDownloadMarc'].prefetch_related
408-
409-
@property
410-
def select_related(self):
411-
return self.children['BibsDownloadMarc'].select_related
412-
413-
def export_records(self, records):
414-
cmd = 'bash'
415-
index_script = settings.SOLRMARC_COMMAND
416-
config_file = settings.SOLRMARC_CONFIG_FILE
417-
filedir = settings.MEDIA_ROOT
418-
if filedir[-1] != '/':
419-
filedir = '{}/'.format(filedir)
420-
bib_converter = self.children['BibsDownloadMarc']
421-
converter_vals = bib_converter.export_records(records)
422-
filename = converter_vals['marcfile']
423-
filepath = '{}{}'.format(filedir, filename)
424-
try:
425-
output = subprocess.check_output([cmd, index_script, config_file,
426-
filepath],
427-
stderr=subprocess.STDOUT,
428-
shell=False,
429-
universal_newlines=True)
430-
output = output.decode('unicode-escape')
431-
except subprocess.CalledProcessError as e:
432-
error_lines = e.output.split("\n")
433-
for line in error_lines:
434-
self.log('Error', line)
435-
self.log('Error', 'Solrmarc process did not run successfully.')
436-
else:
437-
error_lines = output.split("\n")
438-
del(error_lines[-1])
439-
if error_lines:
440-
for line in error_lines:
441-
line = re.sub(r'^\s+', '', line)
442-
if re.match(r'^WARN', line):
443-
self.log('Warning', line)
444-
elif re.match(r'^ERROR', line):
445-
self.log('Warning', line)
446-
447-
# if all went well, we now try to index the MARC record
448-
try:
449-
self.indexes['MARC'].do_update(records)
450-
except Exception as e:
451-
self.log_error(e)
452-
453-
# delete the file when we're done so we don't take up space
454-
os.remove(filepath)
411+
prefetch_related = [
412+
'record_metadata__varfield_set',
413+
'bibrecorditemrecordlink_set',
414+
'bibrecorditemrecordlink_set__item_record',
415+
'bibrecorditemrecordlink_set__item_record__record_metadata',
416+
'bibrecorditemrecordlink_set__item_record__record_metadata'
417+
'__record_type',
418+
'bibrecordproperty_set',
419+
'bibrecordproperty_set__material__materialpropertyname_set',
420+
'bibrecordproperty_set__material__materialpropertyname_set'
421+
'__iii_language',
422+
]
423+
select_related = ['record_metadata', 'record_metadata__record_type']
455424

456425

457426
class ItemsBibsToSolr(AttachedRecordExporter):

django/sierra/export/exporter.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,14 +400,20 @@ def indexes(self):
400400
self._indexes = type(self).spawn_indexes(self.export_type)
401401
return self._indexes
402402

403+
def handle_error(self, obj_str, error):
    """
    Handle one `(obj_str, error)` entry from an index's
    `last_batch_errors` list.

    `obj_str` is either a label for the object the error applies to or
    one of two special markers:

    'ERROR' -- `error` is raised as-is, so it must be an exception
    instance.

    'WARNING' -- `error` is logged as a warning that is not tied to any
    particular object.

    Any other value is treated as the object label and is used as the
    prefix of the logged warning message.
    """
    if obj_str == 'ERROR':
        raise error
    # The prefix supplies its own trailing space, so the template below
    # must not add another one; otherwise messages come out with a
    # doubled space (or a leading space when there is no prefix).
    prefix = '' if obj_str == 'WARNING' else '{} '.format(obj_str)
    msg = '{}update skipped due to error: {}'.format(prefix, error)
    self.log('Warning', msg)
409+
403410
def export_records(self, records):
404411
for index in self.indexes.values():
405412
index.do_update(records)
406413

407414
for index in self.indexes.values():
408415
for obj_str, e in index.last_batch_errors:
409-
msg = '{} update skipped due to error: {}'.format(obj_str, e)
410-
self.log('Warning', msg)
416+
self.handle_error(obj_str, e)
411417

412418
def delete_records(self, records):
413419
for index in self.indexes.values():

django/sierra/sierra/settings/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -291,15 +291,15 @@ def raise_setting_error(setting):
291291
'TIMEOUT': 60 * 20,
292292
},
293293
'bibdata': {
294-
'ENGINE': 'sierra.solr_backend.CustomSolrEngine',
294+
'ENGINE': 'sierra.solr_backend.SolrmarcEngine',
295295
'URL': solr_bibdata_url,
296296
'TIMEOUT': 60 * 20,
297297
},
298298
'marc': {
299299
'ENGINE': 'sierra.solr_backend.CustomSolrEngine',
300300
'URL': solr_marc_url,
301301
'TIMEOUT': 60 * 20,
302-
},
302+
}
303303
}
304304

305305
# HAYSTACK_LIMIT_TO_REGISTERED_MODELS, set to False to allow Haystack

django/sierra/sierra/solr_backend.py

Lines changed: 112 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,13 @@
1414
2. Overrides the SolrSearchBackend.clear() method so that the Solr
1515
index optimization isn't triggered if commit is false.
1616
"""
17+
import subprocess
18+
import os
19+
import shlex
20+
import re
1721

1822
from django.apps import apps
23+
from django.conf import settings
1924

2025
from haystack.backends import solr_backend, BaseEngine
2126
from haystack.models import SearchResult
@@ -155,4 +160,110 @@ def clear(self, models=[], commit=True):
155160

156161
class CustomSolrEngine(BaseEngine):
157162
backend = CustomSolrSearchBackend
158-
query = solr_backend.SolrSearchQuery
163+
query = solr_backend.SolrSearchQuery
164+
165+
166+
class SolrmarcIndexBackend(CustomSolrSearchBackend):
    """
    This is a custom Solr backend class for Haystack(ish) indexes that
    implements doing index updates via Solrmarc. All of the code here
    is derived from the code that was part of the `BibsDownloadMarc`
    and `BibsToSolr` exporters (in `export.basic_exporters`). As we're
    working on additional indexes fed by Solrmarc (for Blacklight),
    it started to make more sense to move that into a lower-level
    class for more uniformity at the index and exporter levels.

    How to use this class? In Django settings, use the SolrmarcEngine
    class in your HAYSTACK_CONNECTIONS definition. Ensure that you've
    created the applicable Solr core and that you have an
    index.properties file in the solr/solrmarc project directory for
    that index. (By default you should name it <core>_index.properties,
    where <core> is the name of the Solr core.) Your haystack index
    class should be a `base.search_indexes.CustomQuerySetIndex` or
    `SolrmarcIndex` class. There are a few class attributes you can add
    to the index class to help further define how the Solrmarc process
    works--without them, sensible defaults are used.

    `s2marc_class` -- The S2MarcBatch (see `export.sierra2marc`) or
    equivalent/derived class that does the batch conversion of Sierra
    data (via the Django ORM models) to MARC records and saves them to
    the filesystem so that Solrmarc can index them. Default is
    S2MarcBatch.

    `index_properties` -- The filename for the index.properties file
    that converts the MARC files to Solr fields. As mentioned above,
    the default is '<core>_index.properties' -- where <core> is the
    name of the Solr core for that index.

    `config_file` -- The filename for the Solrmarc config.properties
    file that defines a bunch of settings used by Solrmarc. Default is
    the SOLRMARC_CONFIG_FILE Django setting.

    `temp_filedir` -- The filesystem directory where the temporary MARC
    file that gets loaded into Solrmarc is stored. Default is the
    MEDIA_ROOT Django setting.
    """

    class IndexError(Exception):
        """
        Wraps a plain-text message so it can be stored as an exception
        in an index's `last_batch_errors` list.

        NOTE: this shares its name with the `IndexError` builtin. Inside
        methods a bare `IndexError` is still the builtin; refer to this
        class as `self.IndexError`.
        """
        pass

    def log_error(self, index, obj_str, err):
        """
        Append an `(obj_str, err)` tuple to `index.last_batch_errors`.

        `err` may be an exception instance or a plain message; a plain
        message is wrapped in `self.IndexError` first so that every
        stored entry holds an exception.
        """
        err = err if isinstance(err, Exception) else self.IndexError(err)
        index.last_batch_errors.append((obj_str, err))

    def _records_to_marcfile(self, index, records):
        """
        Convert `records` to MARC using `index.s2marc_class` and write
        them to a file. Returns the filename reported by the batch
        object's `to_file` method.

        Conversion errors collected on the batch object are recorded on
        the index via `log_error`. An IOError while writing the file is
        re-raised as an IOError with a more descriptive message.
        """
        batch = index.s2marc_class(records)
        out_recs = batch.to_marc()
        try:
            filename = batch.to_file(out_recs, append=False)
        except IOError as e:
            raise IOError('Error writing to output file: {}'.format(e))
        for e in batch.errors:
            self.log_error(index, e.id, e.msg)
        return filename

    def _formulate_solrmarc_cmd(self, index, rec_filepath, commit):
        """
        Build and return the Solrmarc command line, as one string, for
        indexing the MARC file at `rec_filepath`.

        `index_properties` and `config_file` are read from `index` if
        set to a truthy value; otherwise '<core>_index.properties' and
        the SOLRMARC_CONFIG_FILE setting are used. `commit` controls
        the value passed for `solr.commit_at_end`.

        The caller tokenizes the returned string with `shlex.split`, so
        every value that may contain whitespace is double-quoted.
        """
        default_props = '{}_index.properties'.format(self.get_core_name())
        index_properties = (getattr(index, 'index_properties', None)
                            or default_props)
        config_file = (getattr(index, 'config_file', None)
                       or settings.SOLRMARC_CONFIG_FILE)
        commit_str = 'true' if commit else 'false'
        jarfile = ('{}/../../solr/solrmarc/StanfordSearchWorksSolrMarc.jar'
                   ''.format(settings.PROJECT_DIR))
        # The config file and MARC file paths are quoted like the other
        # values so that paths containing spaces stay one argument.
        return ('java -Xmx1g -Dsolr.hosturl="{}" '
                '-Dsolrmarc.indexing.properties="{}" '
                '-Dsolr.commit_at_end="{}" '
                '-jar "{}" "{}" "{}"'
                ''.format(self.conn.url, index_properties, commit_str,
                          jarfile, config_file, rec_filepath))

    def get_core_name(self):
        """
        Return the last path segment of the Solr connection URL, which
        is used as the core name. Trailing slashes are ignored, so
        '.../solr/bibdata/' and '.../solr/bibdata' both give 'bibdata'.
        """
        return self.conn.url.rstrip('/').split('/')[-1]

    def update(self, index, records, commit=False):
        """
        Index `records` by converting them to a temporary MARC file and
        running Solrmarc on that file.

        Nothing is returned. Problems are recorded on
        `index.last_batch_errors` via `log_error`: a non-zero Solrmarc
        exit status is stored under the 'ERROR' label, and each output
        line that starts with WARN or ERROR is stored under the
        'WARNING' label. Exceptions other than a failed Solrmarc run
        (e.g. an IOError from writing the MARC file) propagate to the
        caller.

        The temporary file is removed whether or not the run succeeds.
        """
        filedir = getattr(index, 'temp_filedir', None) or settings.MEDIA_ROOT
        rec_filename = self._records_to_marcfile(index, records)
        rec_filepath = os.path.join(filedir, rec_filename)
        cmd = self._formulate_solrmarc_cmd(index, rec_filepath, commit)
        try:
            result = subprocess.check_output(shlex.split(cmd),
                                             stderr=subprocess.STDOUT,
                                             shell=False,
                                             universal_newlines=True)
            # NOTE(review): `.decode` presumes a Python 2 byte string
            # is returned here -- confirm before porting to Python 3.
            output = result.decode('unicode-escape')
        except subprocess.CalledProcessError as e:
            msg = ('Solrmarc process did not run successfully: {}'
                   ''.format(e.output))
            self.log_error(index, 'ERROR', msg)
        else:
            # Check every line, including a final one that has no
            # trailing newline; empty lines never match.
            for line in output.split('\n'):
                line = line.lstrip()
                if line.startswith(('WARN', 'ERROR')):
                    self.log_error(index, 'WARNING', line)
        finally:
            # Clean up even when an unexpected exception is on its way
            # out, so failed runs don't leave MARC files behind.
            if os.path.exists(rec_filepath):
                os.remove(rec_filepath)
265+
266+
267+
class SolrmarcEngine(BaseEngine):
    """
    Haystack engine that pairs the Solrmarc-based index backend with
    the standard Solr search query class.
    """
    query = solr_backend.SolrSearchQuery
    backend = SolrmarcIndexBackend

0 commit comments

Comments
 (0)