X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=data%2Frbot%2Fplugins%2Frss.rb;h=bfc700d3e90a6738b3282e3f662c89fc1dd5c0f6;hb=783ffa4235330029d661752b1023db635b26f2b3;hp=3b8e8c7dcc51c8a2cda6a4df493eae357b8cd470;hpb=6f9bfa43ac907700fcba394e0f6b9d987b1192fb;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git diff --git a/data/rbot/plugins/rss.rb b/data/rbot/plugins/rss.rb index 3b8e8c7d..bfc700d3 100644 --- a/data/rbot/plugins/rss.rb +++ b/data/rbot/plugins/rss.rb @@ -16,7 +16,7 @@ require 'rss' -# Try to load rss/content/2.0 so we can access the data in +# Try to load rss/content/2.0 so we can access the data in # tags. begin require 'rss/content/2.0' @@ -25,21 +25,6 @@ end module ::RSS - # Make an 'unique' ID for a given item, based on appropriate bot options - # Currently only suppored is bot.config['rss.show_updated']: when true, the - # description is included in the uid hashing, otherwise it's not - # - def RSS.item_uid_for_bot(item, opts={}) - options = { :show_updated => true}.merge(opts) - desc = nil - if options[:show_updated] - desc = item.content.content rescue item.description rescue nil - end - [(item.title.content rescue item.title rescue nil), - (item.link.href rescue item.link), - desc].hash - end - # Add support for Slashdot namespace in RDF. The code is just an adaptation # of the DublinCore code. unless defined?(SLASH_PREFIX) @@ -165,12 +150,42 @@ module ::RSS SlashModel::ELEMENTS.collect! {|name| "#{SLASH_PREFIX}_#{name}"} end + + class Element + class << self + def def_bang(name, chain) + class_eval %< + def #{name}! + blank2nil { #{chain.join(' rescue ')} rescue nil } + end + >, *get_file_and_line_from_caller(0) + end + end + + { + :link => %w{link.href link}, + :guid => %w{guid.content guid}, + :content => %w{content.content content}, + :description => %w{description.content description}, + :title => %w{title.content title}, + :category => %w{category.content category}, + :dc_subject => %w{dc_subject}, + :author => %w{author.name.content author.name author}, + :dc_creator => %w{dc_creator} + }.each { |name, chain| def_bang name, chain } + + protected + def blank2nil(&block) + x = yield + (x && !x.empty?) ? x : nil + end + end end class ::RssBlob attr_accessor :url, :handle, :type, :refresh_rate, :xml, :title, :items, - :mutex, :watchers, :last_fetched + :mutex, :watchers, :last_fetched, :http_cache, :last_success def initialize(url,handle=nil,type=nil,watchers=[], xml=nil, lf = nil) @url = url @@ -182,11 +197,13 @@ class ::RssBlob @type = type @watchers=[] @refresh_rate = nil + @http_cache = false @xml = xml @title = nil @items = nil @mutex = Mutex.new @last_fetched = lf + @last_success = nil sanitize_watchers(watchers) end @@ -261,6 +278,10 @@ class RSSFeedsPlugin < Plugin :default => 300, :validate => Proc.new{|v| v > 30}, :desc => "How many seconds to sleep before checking RSS feeds again") + Config.register Config::IntegerValue.new('rss.announce_timeout', + :default => 0, + :desc => "Don't announce watched feed if these many seconds elapsed since the last successful update") + Config.register Config::BooleanValue.new('rss.show_updated', :default => true, :desc => "Whether feed items for which the description was changed should be shown as new") @@ -269,8 +290,23 @@ class RSSFeedsPlugin < Plugin :default => true, :desc => "Whether to display links from the text of a feed item.") + # Make an 'unique' ID for a given item, based on appropriate bot options + # Currently only suppored is bot.config['rss.show_updated']: when false, + # only the guid/link is accounted for. + + def make_uid(item) + uid = [item.guid! || item.link!] + if @bot.config['rss.show_updated'] + uid.push(item.content! || item.description!) + uid.unshift item.title! + end + # debug "taking hash of #{uid.inspect}" + uid.hash + end + + # We used to save the Mutex with the RssBlob, which was idiotic. And - # since Mutexes dumped in one version might not be resotrable in another, + # since Mutexes dumped in one version might not be restorable in another, # we need a few tricks to be able to restore data from other versions of Ruby # # When migrating 1.8.6 => 1.8.5, all we need to do is define an empty @@ -284,10 +320,125 @@ class RSSFeedsPlugin < Plugin end end + # Auxiliary method used to collect two lines for rss output filters, + # running substitutions against DataStream _s_ optionally joined + # with hash _h_ + def make_stream(line1, line2, s, h={}) + ss = s.merge(h) + DataStream.new([line1, line2].compact.join("\n") % ss, ss) + end + + # Define default RSS filters + # + # TODO: load personal ones + def define_filters + @outkey = :"rss.out" + @bot.register_filter(:headlines, @outkey) { |s| + line1 = (s[:handle].empty? ? "%{date}" : "%{handle}") << "%{title}" + make_stream(line1, nil, s) + } + @bot.register_filter(:blog, @outkey) { |s| + author = s[:author] ? (s[:author] + " ") : "" + abt = s[:category] ? "about #{s[:category]} " : "" + line1 = "%{handle}%{date}%{author}blogged %{abt}at %{link}" + line2 = "%{handle}%{title} - %{desc}" + make_stream(line1, line2, s, :author => author, :abt => abt) + } + @bot.register_filter(:photoblog, @outkey) { |s| + author = s[:author] ? (s[:author] + " ") : "" + abt = s[:category] ? "under #{s[:category]} " : "" + line1 = "%{handle}%{date}%{author}added an image %{abt}at %{link}" + line2 = "%{handle}%{title} - %{desc}" + make_stream(line1, line2, s, :author => author, :abt => abt) + } + @bot.register_filter(:news, @outkey) { |s| + line1 = "%{handle}%{date}%{title}%{at}%{link}" % s + line2 = "%{handle}%{date}%{desc}" % s + make_stream(line1, line2, s) + } + @bot.register_filter(:git, @outkey) { |s| + author = s[:author].sub(/@\S+?\s*>/, "@...>") + " " if s[:author] + line1 = "%{handle}%{date}%{author}committed %{title}%{at}%{link}" + make_stream(line1, nil, s, :author => author) + } + @bot.register_filter(:forum, @outkey) { |s| + line1 = "%{handle}%{date}%{title}%{at}%{link}" + make_stream(line1, nil, s) + } + @bot.register_filter(:wiki, @outkey) { |s| + line1 = "%{handle}%{date}%{title}%{at}%{link}" + line1 << "has been edited by %{author}. %{desc}" + make_stream(line1, nil, s) + } + @bot.register_filter(:gmane, @outkey) { |s| + line1 = "%{handle}%{date}Message %{title} sent by %{author}. %{desc}" + make_stream(line1, nil, s) + } + @bot.register_filter(:trac, @outkey) { |s| + author = s[:author].sub(/@\S+?\s*>/, "@...>") + ": " if s[:author] + line1 = "%{handle}%{date}%{author}%{title}%{at}%{link}" + line2 = nil + unless s[:item].title =~ /^(?:Changeset \[(?:[\da-f]+)\]|\(git commit\))/ + line2 = "%{handle}%{date}%{desc}" + end + make_stream(line1, line2, s, :author => author) + } + @bot.register_filter(:"/.", @outkey) { |s| + dept = "(from the #{s[:item].slash_department} dept) " rescue nil + sec = " in section #{s[:item].slash_section}" rescue nil + line1 = "%{handle}%{date}%{dept}%{title}%{at}%{link} " + line1 << "(posted by %{author}%{sec})" + make_stream(line1, nil, s, :dept => dept, :sec => sec) + } + @bot.register_filter(:default, @outkey) { |s| + line1 = "%{handle}%{date}%{title}%{at}%{link}" + line1 << " (by %{author})" if s[:author] + make_stream(line1, nil, s) + } + + # Define an HTML info filter too + @bot.register_filter(:rss, :htmlinfo) { |s| htmlinfo_filter(s) } + + # This is the output format used by the input filter + @bot.register_filter(:htmlinfo, @outkey) { |s| + line1 = "%{title}%{at}%{link}" + make_stream(line1, nil, s) + } + end + + FEED_NS = %r{xmlns.*http://(purl\.org/rss|www.w3c.org/1999/02/22-rdf)} + def htmlinfo_filter(s) + return nil unless s[:headers] and s[:headers]['x-rbot-location'] + return nil unless s[:headers]['content-type'].first.match(/xml|rss|atom|rdf/i) or + (s[:text].include?(" blob.title, :content => output.join(" | ")} + end + + # Display the known rss types + def rss_types(m, params) + ar = @bot.filter_names(@outkey) + ar.delete(:default) + m.reply ar.map { |k| k.to_s }.sort!.join(", ") + end + attr_reader :feeds def initialize super + + define_filters + if @registry.has_key?(:feeds) # When migrating from Ruby 1.8.5 to 1.8.6, dumped Mutexes may render the # data unrestorable. If this happens, we patch the data, thus allowing @@ -313,7 +464,7 @@ class RSSFeedsPlugin < Plugin } @feeds = @registry[:feeds] - raise unless @feeds + raise LoadError, "corrupted feed database" unless @feeds @registry.recovery = nil @@ -402,8 +553,10 @@ class RSSFeedsPlugin < Plugin "rss who watches #{Bold}handle#{Bold}: lists watches for rss #{Bold}handle#{Bold}" when "rewatch" "rss rewatch : restart threads that watch for changes in watched rss" + when "types" + "rss types : show the rss types for which an output format exist (all other types will use the default one)" else - "manage RSS feeds: rss show|list|watched|add|change|del(ete)|rm|(force)replace|watch|unwatch|rmwatch|rewatch|who watches" + "manage RSS feeds: rss types|show|list|watched|add|change|del(ete)|rm|(force)replace|watch|unwatch|rmwatch|rewatch|who watches" end end @@ -461,13 +614,12 @@ class RSSFeedsPlugin < Plugin fetched = fetchRss(feed, m, false) end return unless fetched or feed.xml - if not fetched and feed.items - m.reply "using old data" - else + if fetched or not feed.items parsed = parseRss(feed, m) - m.reply "using old data" unless parsed end return unless feed.items + m.reply "using old data" unless fetched and parsed and parsed > 0 + title = feed.title items = feed.items @@ -497,19 +649,34 @@ class RSSFeedsPlugin < Plugin def list_rss(m, params) wanted = params[:handle] - reply = String.new - @feeds.each { |handle, feed| - next if wanted and !handle.match(/#{wanted}/i) - reply << "#{feed.handle}: #{feed.url} (in format: #{feed.type ? feed.type : 'default'})" - (reply << " refreshing every #{Utils.secs_to_string(feed.refresh_rate)}") if feed.refresh_rate - (reply << " (watched)") if feed.watched_by?(m.replyto) - reply << "\n" - } - if reply.empty? + listed = @feeds.keys + if wanted + wanted_rx = Regexp.new(wanted, true) + listed.reject! { |handle| !handle.match(wanted_rx) } + end + listed.sort! + debug listed + if @bot.config['send.max_lines'] > 0 and listed.size > @bot.config['send.max_lines'] + reply = listed.inject([]) do |ar, handle| + feed = @feeds[handle] + string = handle.dup + (string << " (#{feed.type})") if feed.type + (string << " (watched)") if feed.watched_by?(m.replyto) + ar << string + end.join(', ') + elsif listed.size > 0 + reply = listed.inject([]) do |ar, handle| + feed = @feeds[handle] + string = "#{feed.handle}: #{feed.url} (in format: #{feed.type ? feed.type : 'default'})" + (string << " refreshing every #{Utils.secs_to_string(feed.refresh_rate)}") if feed.refresh_rate + (string << " (watched)") if feed.watched_by?(m.replyto) + ar << string + end.join("\n") + else reply = "no feeds found" reply << " matching #{wanted}" if wanted end - m.reply reply, :max_lines => reply.length + m.reply reply, :max_lines => 0 end def watched_rss(m, params) @@ -690,7 +857,8 @@ class RSSFeedsPlugin < Plugin if params and handle = params[:handle] feed = @feeds.fetch(handle.downcase, nil) if feed - @bot.timer.reschedule(@watch[feed.handle], 0) + feed.http_cache = false + @bot.timer.reschedule(@watch[feed.handle], (params[:delay] || 0).to_f) m.okay if m else m.reply _("no such feed %{handle}") % { :handle => handle } if m @@ -709,7 +877,7 @@ class RSSFeedsPlugin < Plugin private def watchRss(feed, m=nil) if @watch.has_key?(feed.handle) - report_problem("watcher thread for #{feed.handle} is already running", nil, m) + # report_problem("watcher thread for #{feed.handle} is already running", nil, m) return end status = Hash.new @@ -725,11 +893,18 @@ class RSSFeedsPlugin < Plugin failures = status[:failures] begin debug "fetching #{feed}" - first_run = !feed.last_fetched + + first_run = !feed.last_success + if (@bot.config['rss.announce_timeout'] > 0 && + (Time.now - feed.last_success > @bot.config['rss.announce_timeout'])) + debug "#{feed} wasn't polled for too long, supressing output" + first_run = true + end oldxml = feed.xml ? feed.xml.dup : nil - unless fetchRss(feed) + unless fetchRss(feed, nil, feed.http_cache) failures += 1 else + feed.http_cache = true if first_run debug "first run for #{feed}, getting items" parseRss(feed) @@ -737,27 +912,27 @@ class RSSFeedsPlugin < Plugin debug "xml for #{feed} didn't change" failures -= 1 if failures > 0 else - if not feed.items - debug "no previous items in feed #{feed}" - parseRss(feed) - failures -= 1 if failures > 0 - else - # This one is used for debugging - otxt = [] + # This one is used for debugging + otxt = [] + if feed.items.nil? + oids = [] + else # These are used for checking new items vs old ones - uid_opts = { :show_updated => @bot.config['rss.show_updated'] } oids = Set.new feed.items.map { |item| - uid = RSS.item_uid_for_bot(item, uid_opts) + uid = make_uid item otxt << item.to_s debug [uid, item].inspect debug [uid, otxt.last].inspect uid } + end - unless parseRss(feed) - debug "no items in feed #{feed}" + nitems = parseRss(feed) + if nitems.nil? failures += 1 + elsif nitems == 0 + debug "no items in feed #{feed}" else debug "Checking if new items are available for #{feed}" failures -= 1 if failures > 0 @@ -767,7 +942,7 @@ class RSSFeedsPlugin < Plugin # debug feed.xml dispItems = feed.items.reject { |item| - uid = RSS.item_uid_for_bot(item, uid_opts) + uid = make_uid item txt = item.to_s if oids.include?(uid) debug "rejecting old #{uid} #{item.inspect}" @@ -791,7 +966,6 @@ class RSSFeedsPlugin < Plugin debug "No new items found in #{feed}" end end - end end end rescue Exception => e @@ -823,16 +997,10 @@ class RSSFeedsPlugin < Plugin return seconds end - def select_nonempty(*ar) - debug ar - ret = ar.map { |i| (i && i.empty?) ? nil : i }.compact.first - (ret && ret.empty?) ? nil : ret - end - def printFormattedRss(feed, item, opts=nil) - debug item + # debug item places = feed.watchers - handle = "::#{feed.handle}:: " + handle = feed.handle.empty? ? "" : "::#{feed.handle}:: " date = String.new if opts places = opts[:places] if opts.key?(:places) @@ -850,7 +1018,7 @@ class RSSFeedsPlugin < Plugin else date = item.source.updated.content.to_s end - elsif item.respond_to?(:pubDate) + elsif item.respond_to?(:pubDate) if item.pubDate.class <= Time date = item.pubDate.strftime("%Y/%m/%d %H:%M") else @@ -887,13 +1055,13 @@ class RSSFeedsPlugin < Plugin desc_opt[:limit] = @bot.config['rss.text_max'] desc_opt[:a_href] = :link_out if @bot.config['rss.show_links'] - # We prefer content_encoded here as it tends to provide more html formatting + # We prefer content_encoded here as it tends to provide more html formatting # for use with ircify_html. if item.respond_to?(:content_encoded) && item.content_encoded desc = item.content_encoded.ircify_html(desc_opt) elsif item.respond_to?(:description) && item.description desc = item.description.ircify_html(desc_opt) - else + elsif item.respond_to?(:content) && item.content if item.content.type == "html" desc = item.content.content.ircify_html(desc_opt) else @@ -902,59 +1070,36 @@ class RSSFeedsPlugin < Plugin desc = desc.slice(0, desc_opt[:limit]) + "#{Reverse}...#{Reverse}" end end + else + desc = "(?)" end - link = item.link.href rescue item.link.chomp rescue nil + link = item.link! + link.strip! if link - category = select_nonempty((item.category.content rescue nil), (item.dc_subject rescue nil)) - author = select_nonempty((item.author.name.content rescue nil), (item.dc_creator rescue nil), (item.author rescue nil)) + category = item.category! || item.dc_subject! + category.strip! if category + author = item.dc_creator! || item.author! + author.strip! if author line1 = nil line2 = nil at = ((item.title && item.link) ? ' @ ' : '') - case feed.type - when 'blog' - author += " " if author - abt = category ? "about #{category} " : "" - line1 = "#{handle}#{date}#{author}blogged #{abt}at #{link}" - line2 = "#{handle}#{title} - #{desc}" - when 'photoblog' - author += " " if author - abt = category ? "under #{category} " : "" - line1 = "#{handle}#{date}#{author}added an image #{abt}at #{link}" - line2 = "#{handle}#{title} - #{desc}" - when 'news' - line1 = "#{handle}#{date}#{title} @ #{link}" - line2 = line2 = "#{handle}#{date}#{desc}" - when 'git' - author += " " if author - line1 = "#{handle}#{date}#{author}commited #{title} @ #{link}" - when 'forum' - line1 = "#{handle}#{date}#{title}#{at}#{link}" - when 'wiki' - line1 = "#{handle}#{date}#{title}#{at}#{link} has been edited by #{author}. #{desc}" - when 'gmane' - line1 = "#{handle}#{date}Message #{title} sent by #{author}. #{desc}" - when 'trac' - line1 = "#{handle}#{date}#{title} @ #{link}" - unless item.title =~ /^(?:Changeset \[(?:[\da-f]+)\]|\(git commit\))/ - line2 = "#{handle}#{date}#{desc}" - end - when '/.' - dept = "(from the #{item.slash_department} dept) " rescue nil - sec = " in section #{item.slash_section}" rescue nil + key = @bot.global_filter_name(feed.type, @outkey) + key = @bot.global_filter_name(:default, @outkey) unless @bot.has_filter?(key) + + output = @bot.filter(key, :item => item, :handle => handle, :date => date, + :title => title, :desc => desc, :link => link, + :category => category, :author => author, :at => at) + + return output if places.empty? - line1 = "#{handle}#{date}#{dept}#{title}#{at}#{link} (posted by #{author}#{sec})" - else - line1 = "#{handle}#{date}#{title}#{at}#{link}" - line1 << " (by #{author})" if author - end places.each { |loc| - @bot.say loc, line1, :overlong => :truncate - next unless line2 - @bot.say loc, line2, :overlong => :truncate + output.to_s.each_line { |line| + @bot.say loc, line, :overlong => :truncate + } } end @@ -983,8 +1128,16 @@ class RSSFeedsPlugin < Plugin # reassign the 0.9 RDFs to 1.0, and hope it goes right. xml.gsub!("xmlns=\"http://my.netscape.com/rdf/simple/0.9/\"", "xmlns=\"http://purl.org/rss/1.0/\"") + # make sure the parser doesn't double-convert in case the feed is not UTF-8 + xml.sub!(/<\?xml (.*?)\?>/) do |match| + if /\bencoding=(['"])(.*?)\1/.match(match) + match.sub!(/\bencoding=(['"])(?:.*?)\1/,'encoding="UTF-8"') + end + match + end feed.mutex.synchronize do feed.xml = xml + feed.last_success = Time.now end return true end @@ -993,29 +1146,42 @@ class RSSFeedsPlugin < Plugin return nil unless feed.xml feed.mutex.synchronize do xml = feed.xml - begin - ## do validate parse - rss = RSS::Parser.parse(xml) - debug "parsed and validated #{feed}" - rescue RSS::InvalidRSSError - ## do non validate parse for invalid RSS 1.0 + rss = nil + errors = [] + RSS::AVAILABLE_PARSERS.each do |parser| begin - rss = RSS::Parser.parse(xml, false) - debug "parsed but not validated #{feed}" + ## do validate parse + rss = RSS::Parser.parse(xml, true, true, parser) + debug "parsed and validated #{feed} with #{parser}" + break + rescue RSS::InvalidRSSError + begin + ## do non validate parse for invalid RSS 1.0 + rss = RSS::Parser.parse(xml, false, true, parser) + debug "parsed but not validated #{feed} with #{parser}" + break + rescue RSS::Error => e + errors << [parser, e, "parsing rss stream failed, whoops =("] + end rescue RSS::Error => e - report_problem("parsing rss stream failed, whoops =(", e, m) - return nil + errors << [parser, e, "parsing rss stream failed, oioi"] + rescue => e + errors << [parser, e, "processing error occured, sorry =("] end - rescue RSS::Error => e - report_problem("parsing rss stream failed, oioi", e, m) - return nil - rescue => e - report_problem("processing error occured, sorry =(", e, m) - return nil + end + unless errors.empty? + debug errors + self.send(:report_problem, errors.last[2], errors.last[1], m) + return nil unless rss end items = [] if rss.nil? - report_problem("#{feed} does not include RSS 1.0 or 0.9x/2.0", nil, m) + if xml.match(/xmlns\s*=\s*(['"])http:\/\/www.w3.org\/2005\/Atom\1/) and not defined?(RSS::Atom) + report_problem("#{feed.handle} @ #{feed.url} looks like an Atom feed, but your Ruby/RSS library doesn't seem to support it. Consider getting the latest version from http://raa.ruby-lang.org/project/rss/", nil, m) + else + report_problem("#{feed.handle} @ #{feed.url} doesn't seem to contain an RSS or Atom feed I can read", nil, m) + end + return nil else begin rss.output_encoding = 'UTF-8' @@ -1024,24 +1190,24 @@ class RSSFeedsPlugin < Plugin return nil end if rss.respond_to? :channel - rss.channel.title ||= "Unknown" + rss.channel.title ||= "(?)" title = rss.channel.title else title = rss.title.content end rss.items.each do |item| - item.title ||= "Unknown" + item.title ||= "(?)" items << item end end if items.empty? report_problem("no items found in the feed, maybe try weed?", e, m) - return nil + else + feed.title = title.strip + feed.items = items end - feed.title = title - feed.items = items - return true + return items.length end end end @@ -1103,5 +1269,7 @@ plugin.map 'rss unwatch :handle [in :chan]', :action => 'unwatch_rss' plugin.map 'rss rmwatch :handle [in :chan]', :action => 'unwatch_rss' -plugin.map 'rss rewatch [:handle]', +plugin.map 'rss rewatch [:handle] [:delay]', :action => 'rewatch_rss' +plugin.map 'rss types', + :action => 'rss_types'