Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
5af12ee
Refactor Dublin Core ingestion from OAI-PMH ingestor
eilmiv Apr 7, 2026
e36dbc2
Add RSS ingestion for materials and events
eilmiv Apr 7, 2026
be76ff7
Add tests for RSS ingestors
eilmiv Apr 7, 2026
8c2880b
Add ingestors to factory
eilmiv Apr 8, 2026
54895a2
Add support for common extensions
eilmiv Apr 8, 2026
3cea73b
Fix Zeitwerk inflection problem with RSS
eilmiv Apr 8, 2026
2c7c05e
Add support for relative urls
eilmiv Apr 8, 2026
a515e46
Fixes from testing many RSS feeds
eilmiv Apr 8, 2026
cd91db6
Remove start and end date for events based on date published in rss
eilmiv Apr 8, 2026
b8f19c6
Add feed url discovery from youtube url
eilmiv Apr 8, 2026
b2780cf
Fix error class that was too specific
eilmiv Apr 8, 2026
0f042e7
Fix link handling in atom feeds
eilmiv Apr 8, 2026
89e5f53
Use relative import for loading the custom rss media extension
eilmiv Apr 9, 2026
662c450
Add comment for dublin core to text conversion options
eilmiv Apr 9, 2026
0d9556d
Improve error message when there is an unsupported feed type.
eilmiv May 13, 2026
031d9fa
Reuse code from youtube renderer for youtube link detection in RSS in…
eilmiv May 13, 2026
b67ad80
Small refactors in rss ingestion
eilmiv May 21, 2026
02bb1d5
More specific errors in rss ingestion
eilmiv May 21, 2026
0bd85c7
Refactor Yahoo Media RSS namespace patch
eilmiv May 21, 2026
b7196b9
Separate youtube ingestor
eilmiv May 22, 2026
6fe1eff
Remove event rss ingestor
eilmiv May 22, 2026
f8d181a
Test youtube ingestor
eilmiv May 22, 2026
4a23622
Address github copilot comments in rss ingestor implementation
eilmiv May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/initializers/inflections.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,7 @@
# ActiveSupport::Inflector.inflections(:en) do |inflect|
# inflect.acronym "RESTful"
# end

# Register "RSS" as an acronym for the English inflector.
# NOTE(review): presumably needed so autoloaded constants spelled with an upper-case
# "RSS" (e.g. MaterialRSSIngestor) resolve from their snake_case file names — confirm.
ActiveSupport::Inflector.inflections(:en) { |inflections| inflections.acronym('RSS') }
61 changes: 61 additions & 0 deletions config/initializers/rss_media_atom_patch.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
require 'rss'
require 'rss/atom'

# Extension for the Yahoo Media RSS namespace (xmlns:media="http://search.yahoo.com/mrss/").
# Used by feeds that carry rich media metadata, e.g. YouTube channel feeds which include
# <media:group>, <media:title>, and <media:description> elements.

# Reopens the `rss` gem's namespace to add parsing support for the Yahoo Media RSS
# elements <media:group>, <media:title> and <media:description> on Atom feeds and entries.
module RSS
# Constants, mixin and listener registrations for the Media RSS namespace.
module Media
# XML namespace prefix and URI of the Yahoo Media RSS extension.
MEDIA_PREFIX = 'media'
MEDIA_URI = 'http://search.yahoo.com/mrss/'

# Mixin that, when included into a class, declares a `group` child element in the
# media namespace on that class, exposed under the name `media_group`.
module MediaGroupDescriptionModel
extend BaseModel

# Ruby invokes this hook when the module is included; `klass` is the including
# class or module.
def self.append_features(klass)
super
# Nothing is installed when the receiver is a plain Module rather than a class.
return if klass.instance_of?(Module)

klass.install_must_call_validator(MEDIA_PREFIX, MEDIA_URI)
# NOTE(review): '?' presumably marks the child as optional (at most one) in the
# rss gem's occurrence notation — confirm against the gem.
klass.install_have_child_element('group', MEDIA_URI, '?', 'media_group')
end
end

# Parser registrations for the media namespace: `group` is associated with the class
# name 'MediaGroup', and `title` / `description` are registered as text elements named
# `media_title` / `media_description`.
# NOTE(review): exact listener semantics live in the rss gem — verify there.
BaseListener.install_class_name(MEDIA_URI, 'group', 'MediaGroup')
BaseListener.install_get_text_element(MEDIA_URI, 'title', 'media_title')
BaseListener.install_get_text_element(MEDIA_URI, 'description', 'media_description')
end

# Wires the media extension into the Atom feed classes.
module Atom
# Registers the media prefix/URI pair as a namespace on Atom feeds.
Feed.install_ns(Media::MEDIA_PREFIX, Media::MEDIA_URI)

class Feed
include Media::MediaGroupDescriptionModel

class Entry
include Media::MediaGroupDescriptionModel

# Element class for <media:group>; declares `title` and `description` text children
# in the media namespace, exposed as `media_title` and `media_description`.
class MediaGroup < Element
include RSS09

# XML tag name of this element (without the namespace prefix).
@tag_name = 'group'

class << self
# Namespace prefix reported for this element.
def required_prefix
Media::MEDIA_PREFIX
end

# Namespace URI reported for this element.
def required_uri
Media::MEDIA_URI
end
end

install_must_call_validator(Media::MEDIA_PREFIX, Media::MEDIA_URI)
# NOTE(review): '?' presumably marks each text child as optional — confirm.
install_text_element('title', Media::MEDIA_URI, '?', 'media_title')
install_text_element('description', Media::MEDIA_URI, '?', 'media_description')
end
end
end
end
end
81 changes: 81 additions & 0 deletions lib/ingestors/dublin_core_ingestion.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
module Ingestors
  # Shared helpers that turn Dublin Core metadata into material / event attribute structs.
  #
  # The `dc` argument of the builders is a hash read with the symbol keys :title,
  # :description, :creators, :contributors, :rights, :dates, :identifiers, :subjects,
  # :types and :publisher. Values may be plain strings or XML node objects; see
  # #dublin_core_text for how they are converted to text.
  #
  # Including classes must provide +convert_description+, which is called here but not
  # defined in this module.
  module DublinCoreIngestion
    # Builds an OpenStruct with material attributes from Dublin Core data.
    #
    # @param dc [Hash] Dublin Core values keyed by symbol (see module comment)
    # @return [OpenStruct] title, description, authors, contributors, licence,
    #   date_created, date_modified, doi, url, keywords, resource_type, contact
    def build_material_from_dublin_core_data(dc)
      material = OpenStruct.new

      material.title = dc[:title]
      material.description = convert_description(dc[:description])
      material.authors = normalize_dublin_core_values(dc[:creators])
      material.contributors = normalize_dublin_core_values(dc[:contributors])

      # Prefer a licence URL, then any rights statement, then the 'notspecified' default.
      rights = normalize_dublin_core_values(dc[:rights])
      material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified'

      # The first parsable date is the creation date; a modification date is only set
      # when more than one date was parsed.
      parsed_dates = parse_dublin_core_dates(dc[:dates])
      material.date_created = parsed_dates.first
      material.date_modified = parsed_dates.last if parsed_dates.size > 1

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      material.doi = extract_dublin_core_doi(identifiers)
      material.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      material.keywords = normalize_dublin_core_values(dc[:subjects])
      material.resource_type = normalize_dublin_core_values(dc[:types])
      material.contact = dublin_core_contact(dc[:publisher])

      material
    end

    # Builds an OpenStruct with event attributes from Dublin Core data.
    #
    # @param dc [Hash] Dublin Core values keyed by symbol (see module comment)
    # @return [OpenStruct] title, description, organizer, contact, keywords,
    #   event_types, start, end, url
    def build_event_from_dublin_core_data(dc)
      event = OpenStruct.new

      event.title = dc[:title]
      event.description = convert_description(dc[:description])
      event.organizer = normalize_dublin_core_values(dc[:creators]).first
      # A blank publisher yields nil, so the organizer fallback actually applies
      # (an empty string is truthy in Ruby and would otherwise win the `||`).
      event.contact = dublin_core_contact(dc[:publisher]) || event.organizer
      event.keywords = normalize_dublin_core_values(dc[:subjects])
      event.event_types = normalize_dublin_core_values(dc[:types])

      # With a single date, start and end are the same day; with none, both are nil.
      dates = parse_dublin_core_dates(dc[:dates])
      event.start = dates.first
      event.end = dates.last

      identifiers = normalize_dublin_core_values(dc[:identifiers])
      event.url = identifiers.find { |id| id.start_with?('http://', 'https://') }

      event
    end

    # Parses every date value that Date.parse understands, silently dropping the rest.
    #
    # @param dates [Object, Array, nil] raw date value(s)
    # @return [Array<Date>] parsed dates in their original order
    def parse_dublin_core_dates(dates)
      normalize_dublin_core_values(dates).filter_map do |date_value|
        Date.parse(date_value)
      rescue ArgumentError # also covers Date::Error, which is an ArgumentError subclass
        nil
      end
    end

    # Finds the first DOI among the identifiers and returns it as an https://doi.org/ URL.
    #
    # An identifier counts as a DOI when it starts with '10.' or with the http(s)
    # doi.org resolver prefix.
    #
    # @param identifiers [Object, Array, nil] raw identifier value(s)
    # @return [String, nil] canonical DOI URL, or nil when no identifier is a DOI
    def extract_dublin_core_doi(identifiers)
      doi = normalize_dublin_core_values(identifiers).find do |id|
        id.start_with?('10.', 'https://doi.org/', 'http://doi.org/')
      end
      return unless doi

      # Anchored so that only a leading resolver prefix is removed, never text inside the DOI.
      "https://doi.org/#{doi.sub(%r{\Ahttps?://doi\.org/}, '')}"
    end

    # Converts value(s) to a list of unique, stripped, non-blank strings.
    #
    # @param values [Object, Array, nil] a single value or a collection of values
    # @return [Array<String>]
    def normalize_dublin_core_values(values)
      Array(values).map { |v| dublin_core_text(v).to_s.strip }
                   .reject(&:blank?).uniq
    end

    # Stripped publisher text used as contact, or nil when the publisher is missing or blank.
    #
    # The value is wrapped in a one-element array so it is converted as a whole by
    # #dublin_core_text instead of being expanded by Array().
    #
    # @param publisher [Object, nil] raw publisher value
    # @return [String, nil]
    def dublin_core_contact(publisher)
      normalize_dublin_core_values([publisher]).first
    end

    # this method is also used by RSS ingestion under an alias
    #
    # Extracts the text of a single value.
    #
    # @param value [Object, nil] a string or an XML node object
    # @return [String, nil] nil for nil input, otherwise the value's text
    def dublin_core_text(value)
      return nil if value.nil?
      return value.content if value.respond_to?(:content) # rss gem xml nodes
      return value.text if value.respond_to?(:text) && !value.is_a?(String) # Nokogiri xml nodes

      value.to_s
    end
  end
end
2 changes: 2 additions & 0 deletions lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def self.ingestors
Ingestors::ZenodoIngestor,
Ingestors::OaiPmhIngestor,
Ingestors::GithubIngestor,
Ingestors::MaterialRSSIngestor,
Ingestors::YoutubeIngestor
] + taxila_ingestors + llm_ingestors + heptraining_ingestors
end

Expand Down
Loading
Loading