diff --git a/config/initializers/inflections.rb b/config/initializers/inflections.rb index 3860f659e..a45df1401 100644 --- a/config/initializers/inflections.rb +++ b/config/initializers/inflections.rb @@ -14,3 +14,7 @@ # ActiveSupport::Inflector.inflections(:en) do |inflect| # inflect.acronym "RESTful" # end + +ActiveSupport::Inflector.inflections(:en) do |inflect| + inflect.acronym 'RSS' +end diff --git a/config/initializers/rss_media_atom_patch.rb b/config/initializers/rss_media_atom_patch.rb new file mode 100644 index 000000000..a345a8243 --- /dev/null +++ b/config/initializers/rss_media_atom_patch.rb @@ -0,0 +1,61 @@ +require 'rss' +require 'rss/atom' + +# Extension for the Yahoo Media RSS namespace (xmlns:media="http://search.yahoo.com/mrss/"). +# Used by feeds that carry rich media metadata, e.g. YouTube channel feeds which include +# , , and elements. + +module RSS + module Media + MEDIA_PREFIX = 'media' + MEDIA_URI = 'http://search.yahoo.com/mrss/' + + module MediaGroupDescriptionModel + extend BaseModel + + def self.append_features(klass) + super + return if klass.instance_of?(Module) + + klass.install_must_call_validator(MEDIA_PREFIX, MEDIA_URI) + klass.install_have_child_element('group', MEDIA_URI, '?', 'media_group') + end + end + + BaseListener.install_class_name(MEDIA_URI, 'group', 'MediaGroup') + BaseListener.install_get_text_element(MEDIA_URI, 'title', 'media_title') + BaseListener.install_get_text_element(MEDIA_URI, 'description', 'media_description') + end + + module Atom + Feed.install_ns(Media::MEDIA_PREFIX, Media::MEDIA_URI) + + class Feed + include Media::MediaGroupDescriptionModel + + class Entry + include Media::MediaGroupDescriptionModel + + class MediaGroup < Element + include RSS09 + + @tag_name = 'group' + + class << self + def required_prefix + Media::MEDIA_PREFIX + end + + def required_uri + Media::MEDIA_URI + end + end + + install_must_call_validator(Media::MEDIA_PREFIX, Media::MEDIA_URI) + install_text_element('title', 
# Maps a hash of Dublin Core values onto a material record.
#
# @param dc [Hash] Dublin Core data keyed by :title, :description, :creators, :contributors,
#   :rights, :dates, :identifiers, :subjects, :types and :publisher
# @return [OpenStruct] record with title, description, authors, contributors, licence,
#   date_created, (optionally) date_modified, doi, url, keywords, resource_type and contact
def build_material_from_dublin_core_data(dc)
  web_link = ->(text) { text.start_with?('http://', 'https://') }

  OpenStruct.new.tap do |record|
    record.title = dc[:title]
    record.description = convert_description(dc[:description])
    record.authors = normalize_dublin_core_values(dc[:creators])
    record.contributors = normalize_dublin_core_values(dc[:contributors])

    # A rights statement that is a URL wins over free text; 'notspecified' when there is none.
    rights_statements = normalize_dublin_core_values(dc[:rights])
    record.licence = rights_statements.find(&web_link) || rights_statements.first || 'notspecified'

    # First parseable date is the creation date; the last one is the modification date,
    # but only when at least two dates were parsed (the attribute is left unset otherwise).
    created_on, *later_dates = parse_dublin_core_dates(dc[:dates])
    record.date_created = created_on
    record.date_modified = later_dates.last unless later_dates.empty?

    identifier_texts = normalize_dublin_core_values(dc[:identifiers])
    record.doi = extract_dublin_core_doi(identifier_texts)
    record.url = identifier_texts.find(&web_link)

    record.keywords = normalize_dublin_core_values(dc[:subjects])
    record.resource_type = normalize_dublin_core_values(dc[:types])
    record.contact = dublin_core_text(dc[:publisher])
  end
end
# Finds the first identifier that denotes a DOI and returns it as a canonical
# https://doi.org/ URL.
#
# Recognised forms:
# - a bare DOI name starting with "10."
# - a resolver URL on doi.org or the legacy dx.doi.org host (http or https, any letter case)
# - the "doi:" scheme followed by a DOI name (e.g. "doi:10.1000/182")
#
# @param identifiers [Array, Object, nil] identifier values; normalised through
#   normalize_dublin_core_values before matching
# @return [String, nil] "https://doi.org/<doi>" or nil when no identifier looks like a DOI
def extract_dublin_core_doi(identifiers)
  # Anchored so only a leading resolver/scheme prefix is matched and stripped.
  # The lookahead keeps "doi:" from matching values that are not DOI names.
  doi_prefix = %r{\A(?:https?://(?:dx\.)?doi\.org/|doi:\s*(?=10\.))}i

  doi = normalize_dublin_core_values(identifiers).find do |id|
    id.start_with?('10.') || id.match?(doi_prefix)
  end
  return nil unless doi

  "https://doi.org/#{doi.sub(doi_prefix, '')}"
end
# Reads the feed at +url+ and registers one material per feed item.
#
# If the fetched content does not parse as a feed, discover_feed_url is asked for a feed URL
# and, when one is found, that URL is fetched and parsed instead. Parse failures and progress
# notes are appended to @messages.
#
# @param url [String] address of a feed, or of a page from which a feed can be discovered
def read(url)
  stream = open_url(url)
  return if stream.nil?

  origin = url
  payload = stream.read
  feed, failure = parse_feed(payload)

  unless feed
    # Not a feed itself: try to discover one from the fetched content.
    alternate = discover_feed_url(payload, origin)
    if alternate.blank?
      @messages << failure
      @messages << 'Attempted feed discovery, but no feed URL was found.'
      return
    end

    stream = open_url(alternate)
    return if stream.nil?

    payload = stream.read
    feed, failure = parse_feed(payload)
    unless feed
      @messages << failure
      return
    end

    # Relative item links are resolved against the feed that was actually parsed.
    origin = alternate
  end

  case feed
  when RSS::Rss
    @messages << "Parsing RSS feed: #{feed_title(feed)}"
    feed.items.each { |entry| add_material(build_material_from_rss_item(entry, origin)) }
  when RSS::RDF
    @messages << "Parsing RSS-RDF feed: #{feed_title(feed)}"
    from_rss = feed.items.map { |entry| build_material_from_rss_item(entry, origin).to_h }
    # The same document is also scanned for Bioschemas records, which take priority on merge.
    from_bioschemas = extract_rdf_bioschemas_materials(payload)
    merge_with_bioschemas_priority(from_bioschemas, from_rss).each do |record|
      add_material(record)
    end
  when RSS::Atom::Feed
    @messages << "Parsing ATOM feed: #{feed_title(feed)}"
    feed.items.each { |entry| add_material(build_material_from_atom_item(entry, origin)) }
  else
    @messages << "Parsing UNKNOWN feed: #{feed_title(feed)}"
    @messages << 'unsupported feed format'
  end
end
# Returns a human-readable title for a parsed feed, used in progress messages.
#
# The channel title is tried first (feeds that expose #channel), then the feed's own #title.
# A nil or blank title at either level falls through to the next candidate, so the
# 'Untitled feed' placeholder is used whenever no usable title exists.
#
# @param feed [Object] parsed feed object
# @return [String] the first non-blank title, or 'Untitled feed'
def feed_title(feed)
  channel = feed.respond_to?(:channel) ? feed.channel : nil

  candidates = []
  candidates << channel.title if channel.respond_to?(:title)
  candidates << feed.title if feed.respond_to?(:title)

  candidates.each do |candidate|
    title = text_value(candidate)
    # Whitespace-only titles count as missing.
    return title unless title.to_s.strip.empty?
  end

  'Untitled feed'
end
# Combines Bioschemas records with RSS records describing the same URL.
#
# For a shared URL the two hashes are merged key by key: the Bioschemas value wins unless it
# is blank, in which case the RSS value fills the gap. Records are only paired through a
# non-blank URL; records without a URL are never merged with each other and are all kept.
#
# @param bioschemas_records [Array<Hash>] records carrying a :url key
# @param rss_records [Array<Hash>] records carrying a :url key
# @return [Array<Hash>] Bioschemas records (merged where possible) in their original order,
#   followed by unmatched RSS records with a URL, followed by RSS records without a URL
def merge_with_bioschemas_priority(bioschemas_records, rss_records)
  # URL-less records must not share a lookup key, otherwise all but one would be lost.
  rss_without_url, rss_with_url = rss_records.partition { |record| record[:url].to_s.strip.empty? }
  # For duplicate URLs the last RSS record wins.
  rss_by_url = rss_with_url.to_h { |record| [record[:url].to_s, record] }

  merged = bioschemas_records.map do |bioschemas_record|
    # A blank key is never present in the index, so URL-less Bioschemas records pass through.
    rss_record = rss_by_url.delete(bioschemas_record[:url].to_s)
    next bioschemas_record if rss_record.nil?

    rss_record.merge(bioschemas_record) do |_key, rss_value, bioschemas_value|
      bioschemas_value.present? ? bioschemas_value : rss_value
    end
  end

  merged + rss_by_url.values + rss_without_url
end
# Builds a material record from a single Atom entry.
#
# Dublin Core values seed the record; native Atom fields and the media:group extension then
# fill in what is still missing. Blank values (nil or empty text) count as missing at every
# step, so an empty <title> or <summary> does not block the later fallbacks.
#
# @param item [Object] Atom entry (presumably RSS::Atom::Feed::Entry, confirm against caller)
# @param feed_url [String] URL of the feed, used as the base for resolving the entry link
# @return [OpenStruct] the material record
def build_material_from_atom_item(item, feed_url)
  material = build_material_from_dublin_core_data(extract_dublin_core(item))
  media_group = item.media_group

  # Title precedence: Dublin Core, native Atom title, media:group title.
  material.title = material.title.presence ||
                   text_value(item.title).presence ||
                   text_value(media_group&.media_title)

  atom_link = text_value(extract_atom_link(item))
  material.url = Addressable::URI.join(feed_url, atom_link).to_s if atom_link.present?

  # Description precedence: Dublin Core, summary, content, media:group description.
  if material.description.blank?
    raw_description = [item.summary, item.content].map { |node| text_value(node) }.find(&:present?) ||
                      text_value(media_group&.media_description)
    material.description = convert_description(raw_description)
  end

  atom_keywords = if item.respond_to?(:categories)
                    Array(item.categories).map { |c| text_value(c.respond_to?(:term) ? c.term : c) }
                  else
                    []
                  end
  atom_authors = Array(item.authors).map { |author| text_value(author.respond_to?(:name) ? author.name : author) }
  material.keywords = merge_unique(material.keywords, atom_keywords)
  material.authors = merge_unique(material.authors, atom_authors)
  material.contact ||= material.authors&.first
  # The entry id only becomes the DOI when it looks like one.
  material.doi ||= extract_dublin_core_doi([text_value(item.id)])

  published = parse_time(item.published)
  updated = parse_time(item.updated)
  material.date_created = prefer_precise_time(material.date_created, published)
  material.date_published ||= published || updated
  material.date_modified = prefer_precise_time(material.date_modified, updated)

  material
end
normalize_dublin_core_values(dc[:types]) # this event detection heuristic captures in particular # - http://purl.org/dc/dcmitype/Event (the standard way of typing an event in dublin core) # - https://schema.org/Event if types.any? { |t| t.downcase.include? 'event' } - read_dublin_core_event(doc) + add_event(build_event_from_dublin_core_data(dc)) else - read_dublin_core_material(doc) + add_material(build_material_from_dublin_core_data(dc)) end count += 1 @@ -57,63 +75,6 @@ def read_oai_dublin_core(client) @messages << "found #{count} records" end - def read_dublin_core_material(xml_doc) - material = OpenStruct.new - material.title = xml_doc.at_xpath('//dc:title', ns)&.text - material.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text) - material.authors = xml_doc.xpath('//dc:creator', ns).map(&:text) - material.contributors = xml_doc.xpath('//dc:contributor', ns).map(&:text) - - rights = xml_doc.xpath('//dc:rights', ns).map { |n| n.text&.strip }.reject(&:empty?) 
- material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified' - - dates = xml_doc.xpath('//dc:date', ns).map(&:text) - parsed_dates = dates.map do |d| - Date.parse(d) - rescue StandardError - nil - end.compact - material.date_created = parsed_dates.first - material.date_modified = parsed_dates.last if parsed_dates.size > 1 - - identifiers = xml_doc.xpath('//dc:identifier', ns).map(&:text) - doi = identifiers.find { |id| id.start_with?('10.') || id.start_with?('https://doi.org/') || id.start_with?('http://doi.org/') } - if doi - doi = doi&.sub(%r{https?://doi\.org/}, '') - material.doi = "https://doi.org/#{doi}" - end - material.url = identifiers.find { |id| id.start_with?('http://', 'https://') } - - material.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text) - material.resource_type = xml_doc.xpath('//dc:type', ns).map(&:text) - material.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text - - add_material material - end - - def read_dublin_core_event(xml_doc) - event = OpenStruct.new - - event.title = xml_doc.at_xpath('//dc:title', ns)&.text - event.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text) - event.url = xml_doc.xpath('//dc:identifier', ns).map(&:text).find { |id| id.start_with?('http://', 'https://') } - event.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text - event.organizer = xml_doc.at_xpath('//dc:creator', ns)&.text - event.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text) - event.event_types = xml_doc.xpath('//dc:type', ns).map(&:text) - - dates = xml_doc.xpath('//dc:date', ns).map(&:text) - parsed_dates = dates.map do |d| - Date.parse(d) - rescue StandardError - nil - end.compact - event.start = parsed_dates.first - event.end = parsed_dates.last - - add_event event - end - def read_oai_rdf(client) provider_events = [] provider_materials = [] diff --git a/lib/ingestors/youtube_ingestor.rb b/lib/ingestors/youtube_ingestor.rb new file mode 100644 
module Ingestors
  # Material ingestor for YouTube. Behaves like MaterialRSSIngestor, with an extra way of
  # discovering the Atom feed of a playlist from the playlist URL itself.
  class YoutubeIngestor < MaterialRSSIngestor
    require 'cgi'

    # Registration data for this ingestor.
    def self.config
      { key: 'youtube', title: 'YouTube', category: :materials }
    end

    private

    # Returns the feed URL found by the parent class, or, for a YouTube URL carrying a
    # `list` query parameter, the playlist's Atom feed URL. Returns nil when neither applies
    # or when base_url is not a valid URI.
    def discover_feed_url(content, base_url)
      advertised = super(content, base_url) # feed link found in the HTML
      return advertised if advertised

      # YouTube does not include the feed URL of playlists in its HTML
      parsed = URI.parse(base_url)
      return nil unless Renderers::Youtube.is_youtube_url?(base_url)

      playlist_id = CGI.parse(parsed.query.to_s).fetch('list', []).first
      return nil if playlist_id.blank?

      "https://www.youtube.com/feeds/videos.xml?playlist_id=#{CGI.escape(playlist_id)}".tap do |feed_url|
        @messages << "Found Atom feed link from YouTube playlist URL, following: #{feed_url}"
      end
    rescue URI::InvalidURIError
      nil
    end
  end
end
def render_content code = extract_video_code(@resource.url) - (TEMPLATE % { code: code }).html_safe + format(TEMPLATE, code:).html_safe end def extract_video_code(url) - return unless is_youtube_url?(url) + return unless self.class.is_youtube_url?(url) - match = url.match(/[\?\&]v[i]?\=([-_a-zA-Z0-9]+)/) || - url.match(/youtu\.be\/([-_a-zA-Z0-9]+)/) || - url.match(/\/v\/([-_a-zA-Z0-9]+)/) || - url.match(/\/embed\/([-_a-zA-Z0-9]+)/) + match = url.match(/[?&]vi?=([-_a-zA-Z0-9]+)/) || + url.match(%r{youtu\.be/([-_a-zA-Z0-9]+)}) || + url.match(%r{/v/([-_a-zA-Z0-9]+)}) || + url.match(%r{/embed/([-_a-zA-Z0-9]+)}) match[1] if match end - private - - def is_youtube_url?(url) + def self.is_youtube_url?(url) parsed_url = URI.parse(url) VALID_HOSTS.include?(parsed_url.host) && VALID_SCHEMES.include?(parsed_url.scheme) - rescue + rescue StandardError false end end diff --git a/test/fixtures/sources.yml b/test/fixtures/sources.yml index 59edce307..cceaa25a8 100644 --- a/test/fixtures/sources.yml +++ b/test/fixtures/sources.yml @@ -111,3 +111,11 @@ filtered_source: user: regular_user approval_status: 2 +youtube_source: + content_provider: portal_provider + url: 'https://www.youtube.com/feeds/videos.xml?playlist_id=PL123456789' + method: youtube + enabled: true + user: scraper_user + approval_status: 2 + diff --git a/test/unit/ingestors/material_rss_ingestor_test.rb b/test/unit/ingestors/material_rss_ingestor_test.rb new file mode 100644 index 000000000..36b021551 --- /dev/null +++ b/test/unit/ingestors/material_rss_ingestor_test.rb @@ -0,0 +1,597 @@ +require 'test_helper' +require 'stringio' + +class MaterialRSSIngestorTest < ActiveSupport::TestCase + setup do + @ingestor = Ingestors::MaterialRSSIngestor.new + end + + test 'reads rss items from dublin core and native rss fields' do + rss_feed_xml = <<~XML + + + + RSS material feed + + + Native RSS title + https://example.org/rss/native-link + Native RSS description + native.author@example.org (Native RSS Author) + 
native-category + 10.9999/native-rss-guid + Tue, 02 Jan 2024 03:04:05 GMT + DC RSS title + DC RSS description + DC Creator One + DC Creator Two + DC Contributor One + DC Contributor Two + plain rights + https://example.org/licenses/rss + 2024-01-01 + 2024-01-10 + https://example.org/rss/dc-url + 10.1234/rss-doi + dc-subject-a + dc-subject-b + dc-type-a + dc-type-b + rss publisher + + + + Plain Rights RSS title + https://example.org/rss/plain-rights + Plain rights RSS description + Plain Rights RSS Creator + plain-only-rights + not-a-date + 2024-01-11 + https://example.org/rss/plain-rights + plain-rights-subject + plain-rights-type + plain rights publisher + + + + Fallback RSS title + https://example.org/rss/fallback + Fallback RSS Author + fallback-category-a + fallback-category-b + 10.5555/fallback-rss-guid + Wed, 03 Jan 2024 04:05:06 GMT + + + + + XML + + read_xml(rss_feed_xml) + + assert_equal 3, @ingestor.materials.count + + dc_material = @ingestor.materials.first + assert_equal 'DC RSS title', dc_material.title + assert_equal 'https://example.org/rss/native-link', dc_material.url + assert_equal 'DC RSS description', dc_material.description + assert_equal ['DC Creator One', 'DC Creator Two', 'native.author@example.org (Native RSS Author)'], dc_material.authors + assert_equal ['DC Contributor One', 'DC Contributor Two'], dc_material.contributors + assert_equal 'https://example.org/licenses/rss', dc_material.licence + assert_equal Date.new(2024, 1, 1), dc_material.date_created + assert_equal Time.utc(2024, 1, 2, 3, 4, 5), dc_material.date_published.utc + assert_equal Date.new(2024, 1, 10), dc_material.date_modified + assert_equal 'https://doi.org/10.1234/rss-doi', dc_material.doi + assert_equal %w[dc-subject-a dc-subject-b native-category], dc_material.keywords + assert_equal %w[dc-type-a dc-type-b], dc_material.resource_type + assert_equal 'rss publisher', dc_material.contact + + plain_rights_material = @ingestor.materials.second + assert_equal 'Plain Rights RSS 
title', plain_rights_material.title + assert_equal 'https://example.org/rss/plain-rights', plain_rights_material.url + assert_equal 'Plain rights RSS description', plain_rights_material.description + assert_equal ['Plain Rights RSS Creator'], plain_rights_material.authors + assert_equal [], plain_rights_material.contributors + assert_equal 'plain-only-rights', plain_rights_material.licence + assert_equal Date.new(2024, 1, 11), plain_rights_material.date_created + assert_nil plain_rights_material.date_modified + assert_nil plain_rights_material.doi + assert_equal ['plain-rights-subject'], plain_rights_material.keywords + assert_equal ['plain-rights-type'], plain_rights_material.resource_type + assert_equal 'plain rights publisher', plain_rights_material.contact + + fallback_material = @ingestor.materials.third + assert_equal 'Fallback RSS title', fallback_material.title + assert_equal 'https://example.org/rss/fallback', fallback_material.url + assert_equal 'Fallback RSS content encoded', fallback_material.description + assert_equal ['Fallback RSS Author'], fallback_material.authors + assert_equal [], fallback_material.contributors + assert_equal 'notspecified', fallback_material.licence + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_created.utc + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_published.utc + assert_equal Time.utc(2024, 1, 3, 4, 5, 6), fallback_material.date_modified.utc + assert_equal 'https://doi.org/10.5555/fallback-rss-guid', fallback_material.doi + assert_equal %w[fallback-category-a fallback-category-b], fallback_material.keywords + assert_equal [], fallback_material.resource_type + assert_equal 'Fallback RSS Author', fallback_material.contact + end + + test 'reads atom items from dublin core and native atom fields' do + atom_feed_xml = <<~XML + + + Atom material feed + + + Native Atom title + + + Native Atom summary + Native Atom Author + + 10.9999/native-atom-id + 2024-02-02T03:04:05Z + 
2024-02-03T03:04:05Z + DC Atom title + DC Atom description + DC Atom Creator One + DC Atom Creator Two + DC Atom Contributor One + plain atom rights + https://example.org/licenses/atom + 2024-02-01 + 2024-02-05 + https://example.org/atom/dc-url + https://doi.org/10.1234/atom-doi + atom-dc-subject + atom-dc-type + atom publisher + + + + Plain Rights Atom title + + Plain rights Atom description + Plain Rights Atom Creator + plain-atom-rights + invalid-date + 2024-02-11 + https://example.org/atom/plain-rights + plain-atom-subject + plain-atom-type + plain atom publisher + + + + Fallback Atom title + + Fallback Atom content + Fallback Atom Author + + + 10.5555/fallback-atom-id + 2024-03-04T05:06:07Z + 2024-03-05T06:07:08Z + + + XML + + read_xml(atom_feed_xml) + + assert_equal 3, @ingestor.materials.count + + dc_material = @ingestor.materials.first + assert_equal 'DC Atom title', dc_material.title + assert_equal 'https://example.org/atom/native-link', dc_material.url + assert_equal 'DC Atom description', dc_material.description + assert_equal ['DC Atom Creator One', 'DC Atom Creator Two', 'Native Atom Author'], dc_material.authors + assert_equal ['DC Atom Contributor One'], dc_material.contributors + assert_equal 'https://example.org/licenses/atom', dc_material.licence + assert_equal Date.new(2024, 2, 1), dc_material.date_created + assert_equal Time.utc(2024, 2, 2, 3, 4, 5), dc_material.date_published.utc + assert_equal Date.new(2024, 2, 5), dc_material.date_modified + assert_equal 'https://doi.org/10.1234/atom-doi', dc_material.doi + assert_equal %w[atom-dc-subject native-atom-category], dc_material.keywords + assert_equal ['atom-dc-type'], dc_material.resource_type + assert_equal 'atom publisher', dc_material.contact + + plain_rights_material = @ingestor.materials.second + assert_equal 'Plain Rights Atom title', plain_rights_material.title + assert_equal 'https://example.org/atom/plain-rights', plain_rights_material.url + assert_equal 'Plain rights Atom description', 
plain_rights_material.description + assert_equal ['Plain Rights Atom Creator'], plain_rights_material.authors + assert_equal [], plain_rights_material.contributors + assert_equal 'plain-atom-rights', plain_rights_material.licence + assert_equal Date.new(2024, 2, 11), plain_rights_material.date_created + assert_nil plain_rights_material.date_modified + assert_nil plain_rights_material.doi + assert_equal ['plain-atom-subject'], plain_rights_material.keywords + assert_equal ['plain-atom-type'], plain_rights_material.resource_type + assert_equal 'plain atom publisher', plain_rights_material.contact + + fallback_material = @ingestor.materials.third + assert_equal 'Fallback Atom title', fallback_material.title + assert_equal 'https://example.org/atom/fallback', fallback_material.url + assert_equal 'Fallback Atom content', fallback_material.description + assert_equal ['Fallback Atom Author'], fallback_material.authors + assert_equal [], fallback_material.contributors + assert_equal 'notspecified', fallback_material.licence + assert_equal Time.utc(2024, 3, 4, 5, 6, 7), fallback_material.date_created.utc + assert_equal Time.utc(2024, 3, 4, 5, 6, 7), fallback_material.date_published.utc + assert_equal Time.utc(2024, 3, 5, 6, 7, 8), fallback_material.date_modified.utc + assert_equal 'https://doi.org/10.5555/fallback-atom-id', fallback_material.doi + assert_equal %w[fallback-atom-category-a fallback-atom-category-b], fallback_material.keywords + assert_equal [], fallback_material.resource_type + assert_equal 'Fallback Atom Author', fallback_material.contact + end + + test 'logs parse error for invalid feed input' do + read_xml('not valid rss or atom') + + assert_equal 2, @ingestor.messages.length + assert_match(/^parsing feed failed with RSS::NotWellFormedError: This is not well formed XML/, @ingestor.messages.first) + assert_equal 'Attempted feed discovery, but no feed URL was found.', + @ingestor.messages.second + assert_empty @ingestor.materials + end + + test 'reads rss 
0.91 feed' do
+    rss_091_feed_xml = <<~XML
+
+
+
+      RSS 0.91 feed
+      https://example.org/rss091
+      desc
+
+      RSS 0.91 title
+      https://example.org/rss091/item
+      RSS 0.91 description
+
+
+
+    XML
+
+    read_xml(rss_091_feed_xml)
+
+    assert_equal 1, @ingestor.materials.count
+
+    material = @ingestor.materials.first
+    assert_equal 'RSS 0.91 title', material.title
+    assert_equal 'https://example.org/rss091/item', material.url
+    assert_equal 'RSS 0.91 description', material.description
+    assert_equal [], material.keywords
+    assert_equal 'notspecified', material.licence
+    assert_nil material.doi
+    assert_nil material.contact
+  end
+
+  test 'reads rss 1.0 feed' do
+    rss_10_feed_xml = <<~XML
+
+
+
+      RSS 1.0 feed
+      https://example.org/rss10
+      desc
+
+
+
+
+
+
+
+      RSS 1.0 title
+      https://example.org/rss10/item
+      RSS 1.0 description
+      RSS 1.0 Creator
+      rss10-subject
+      10.1111/rss10doi
+      2024-04-01
+
+
+    XML
+
+    read_xml(rss_10_feed_xml)
+
+    assert_equal 1, @ingestor.materials.count
+
+    material = @ingestor.materials.first
+    assert_equal 'RSS 1.0 title', material.title
+    assert_equal 'https://example.org/rss10/item', material.url
+    assert_equal 'RSS 1.0 description', material.description
+    assert_equal ['RSS 1.0 Creator'], material.authors
+    assert_equal ['rss10-subject'], material.keywords
+    assert_equal 'https://doi.org/10.1111/rss10doi', material.doi
+    assert_equal Date.new(2024, 4, 1), material.date_created.to_date
+    assert_equal Date.new(2024, 4, 1), material.date_modified.to_date
+  end
+
+  test 'reads bioschemas learning resource from rss 1.0 rdf feed' do
+    rss_10_bioschemas_feed_xml = <<~XML
+
+
+
+      RSS 1.0 Bioschemas feed
+      https://example.org/rss10-bioschemas
+      desc
+
+
+
+
+
+
+
+      Fallback RSS 1.0 title
+      https://example.org/rss10-bioschemas/item
+      Fallback RSS 1.0 description
+
+
+
+
+
+
+      RSS 1.0 Bioschemas title
+
+
+
+
+    XML
+
+    read_xml(rss_10_bioschemas_feed_xml)
+
+    assert_equal 2, @ingestor.materials.count
+
+    material = @ingestor.materials.find { |m| m.url == 'https://example.org/rss10/bioschemas/material' }
+    refute_nil material
+    assert_equal 'RSS 1.0 Bioschemas title', material.title
+    assert_equal 'https://example.org/rss10/bioschemas/material', material.url
+    assert_equal 'https://opensource.org/licenses/MIT', material.licence
+
+    fallback_material = @ingestor.materials.find { |m| m.url == 'https://example.org/rss10-bioschemas/item' }
+    refute_nil fallback_material
+    assert_equal 'Fallback RSS 1.0 title', fallback_material.title
+  end
+
+  test 'merges rss properties into bioschemas material for same url with bioschemas priority' do
+    rss_10_bioschemas_merged_feed_xml = <<~XML
+
+
+
+      RSS 1.0 Bioschemas merged feed
+      https://example.org/rss10-merged
+      desc
+
+
+
+
+
+
+
+
+      RSS 1.0 fallback title
+      https://example.org/rss10/merged/material
+      RSS 1.0 fallback description that should fill missing bioschemas value
+      RSS 1.0 Merged Creator
+      rss10-merged-subject
+      2024-05-01
+
+
+
+
+
+
+      RSS 1.0 Bioschemas preferred title
+
+
+
+
+    XML
+
+    read_xml(rss_10_bioschemas_merged_feed_xml)
+
+    assert_equal 1, @ingestor.materials.count
+
+    material = @ingestor.materials.first
+    assert_equal 'RSS 1.0 Bioschemas preferred title', material.title
+    assert_equal 'https://example.org/rss10/merged/material', material.url
+    assert_equal 'https://opensource.org/licenses/Apache-2.0', material.licence
+    assert_equal 'RSS 1.0 fallback description that should fill missing bioschemas value', material.description
+    assert_equal ['rss10-merged-subject'], material.keywords
+    assert_equal ['RSS 1.0 Merged Creator'], material.authors
+    assert_equal Date.new(2024, 5, 1), material.date_created.to_date
+    assert_equal Date.new(2024, 5, 1), material.date_modified.to_date
+  end
+
+  test 'reads feed from html alternate meta link' do
+    start_url = 'https://www.youtube.com/@example'
+    feed_url = 'https://www.youtube.com/feeds/videos.xml?channel_id=UC123456789'
+
+    html_with_alternate_feed_link = <<~HTML
+
+
+
+      Channel
+
+
+      Channel page
+
+    HTML
+
+    atom_feed_xml = <<~XML
+
+
+      Minimal Atom material feed
+
+      Alternate feed material
+
+      Minimal content used for alternate-link test
+      Alternate Feed Author
+      2024-02-02T03:04:05Z
+
+
+    XML
+
+    read_xml_map(
+      {
+        start_url => html_with_alternate_feed_link,
+        feed_url => atom_feed_xml
+      },
+      start_url
+    )
+
+    assert_equal 1, @ingestor.materials.count
+    assert_includes @ingestor.messages,
+                    "Found RSS/Atom feed link in HTML page, following: #{feed_url}"
+    assert_equal 'Alternate feed material', @ingestor.materials.first.title
+  end
+
+  test 'uses native atom title and description taking precedence over media extension' do
+    atom_feed_xml = <<~XML
+
+
+      Atom media precedence feed
+
+
+      yt:video:abc123
+      Native Atom title wins
+
+      Native Atom summary wins
+      Atom Author
+      2024-02-02T03:04:05Z
+      2024-02-03T03:04:05Z
+
+      Media title ignored
+      Media description ignored
+
+
+
+    XML
+
+    read_xml(atom_feed_xml)
+
+    assert_equal 1, @ingestor.materials.count
+    material = @ingestor.materials.first
+    assert_equal 'Native Atom title wins', material.title
+    assert_equal 'Native Atom summary wins', material.description
+  end
+
+  test 'uses media extension title and description for atom item when native ones are missing' do
+    atom_feed_xml = <<~XML
+
+
+      Atom media extension feed
+
+
+      yt:video:fallback123
+
+      Atom Author
+      2024-02-02T03:04:05Z
+      2024-02-03T03:04:05Z
+
+      Media title used here
+      Media description used here
+
+
+
+    XML
+
+    read_xml(atom_feed_xml)
+
+    assert_equal 1, @ingestor.materials.count
+    material = @ingestor.materials.first
+    assert_equal 'Media title used here', material.title
+    assert_equal 'Media description used here', material.description
+  end
+
+  test 'parses media group description through rss media extension' do
+    atom_feed_xml = <<~XML
+
+
+      Media extension feed
+      urn:feed:test
+      2024-01-01T00:00:00Z
+
+
+      urn:entry:test
+      Media extension title
+
+      2024-01-01T00:00:00Z
+
+      Media extension description
+
+
+
+    XML
+
+    feed = RSS::Parser.parse(atom_feed_xml, validate: false, ignore_unknown_element: true)
+    item = feed.items.first
+
+    assert item.respond_to?(:media_group)
+    assert_equal 'Media extension description', item.media_group.media_description
+  end
+
+  test 'uses itunes extension summary for rss item when native description is missing' do
+    rss_feed_xml = <<~XML
+
+
+
+      RSS iTunes extension feed
+
+      RSS item with iTunes summary
+      https://example.org/rss/itunes-summary
+      RSS Author
+      Fri, 02 Feb 2024 03:04:05 GMT
+      iTunes summary used here
+      iTunes Author
+
+
+
+    XML
+
+    read_xml(rss_feed_xml)
+
+    assert_equal 1, @ingestor.materials.count
+    material = @ingestor.materials.first
+    assert_equal 'RSS item with iTunes summary', material.title
+    assert_equal 'iTunes summary used here', material.description
+    assert_includes material.authors, 'RSS Author'
+    assert_includes material.authors, 'iTunes Author'
+  end
+
+  private
+
+  def read_xml(xml, url = 'https://example.org/feed.xml')
+    @ingestor.stub(:open_url, StringIO.new(xml)) do
+      @ingestor.read(url)
+    end
+  end
+
+  def read_xml_map(url_to_content, start_url)
+    @ingestor.stub(:open_url, lambda do |requested_url|
+      content = url_to_content[requested_url]
+      StringIO.new(content) unless content.nil?
+    end) do
+      @ingestor.read(start_url)
+    end
+  end
+end
diff --git a/test/unit/ingestors/youtube_ingestor_test.rb b/test/unit/ingestors/youtube_ingestor_test.rb
new file mode 100644
index 000000000..9739ce70c
--- /dev/null
+++ b/test/unit/ingestors/youtube_ingestor_test.rb
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+require 'test_helper'
+
+class YoutubeIngestorTest < ActiveSupport::TestCase
+  test 'discovers a YouTube playlist feed URL' do
+    ingestor = Ingestors::YoutubeIngestor.new
+    base_url = 'https://www.youtube.com/watch?v=abc123&list=PL123456789'
+    expected = 'https://www.youtube.com/feeds/videos.xml?playlist_id=PL123456789'
+
+    assert_equal expected, ingestor.send(:discover_feed_url, '', base_url)
+    assert_equal ["Found Atom feed link from YouTube playlist URL, following: #{expected}"], ingestor.messages
+  end
+
+  test 'scrapes a YouTube playlist source end to end' do
+    user = users(:scraper_user)
+    provider = content_providers(:portal_provider)
+    source = sources(:youtube_source)
+    scraper = Scraper.new({ username: user.username, sources: [] })
+
+    feed_url = source.url
+
+    WebMock.stub_request(:get, feed_url).to_return(
+      body: <<~XML
+
+
+        YouTube feed
+
+        Video title
+
+        Video summary
+        tag:youtube.com,2008:video:abc123
+        2024-01-01T00:00:00Z
+        2024-01-01T00:00:00Z
+
+
+      XML
+    )
+
+    with_settings(user_ingestion_methods: ['youtube']) do
+      assert_difference('provider.materials.count', 1) do
+        scraper.scrape(source, user)
+      end
+    end
+
+    material = provider.materials.find_by(url: 'https://www.youtube.com/watch?v=abc123')
+    assert material
+    assert_equal 'Video title', material.title
+
+    source.reload
+    assert_equal 1, source.records_read
+    assert_equal 1, source.records_written
+  end
+end