data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class ::UrlLinkError < RuntimeError
   9 end
  10
  11 class UrlPlugin < Plugin
  # Captures the contents of the first <title> element of a page
  # (case-insensitive, may span several lines).
  TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
  # Prefix of the replies that carry link information.
  LINK_INFO = "[Link Info]"
  # Characters that get escaped in a message before URLs are extracted from
  # it: anything that is not a URI unreserved/reserved character, '%', '#' or
  # a space.
  OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')

  BotConfig.register BotConfigIntegerValue.new('url.max_urls',
    :default => 100, :validate => Proc.new{|v| v > 0},
    :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  BotConfig.register BotConfigIntegerValue.new('url.display_link_info',
    :default => 0,
    :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  BotConfig.register BotConfigBooleanValue.new('url.titles_only',
    :default => false,
    :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  BotConfig.register BotConfigBooleanValue.new('url.first_par',
    :default => false,
    :desc => "Also try to get the first paragraph of a web page")
  BotConfig.register BotConfigBooleanValue.new('url.info_on_list',
    :default => false,
    :desc => "Show link info when listing/searching for urls")
  # Default blacklist: localhost, the RFC 1918 private ranges (192.168/16,
  # 10/8 and 172.16/12, i.e. 172.16.x.x up to and including 172.31.x.x) and
  # the whole 127/8 loopback range.
  BotConfig.register BotConfigArrayValue.new('url.no_info_hosts',
    :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|3[01])\.'],
    :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
    :desc => "A list of regular expressions matching hosts for which no info should be provided")
  35
  36
  # Prepares the plugin: makes an empty Array the default registry value,
  # re-stores url.display_link_info through its string form when the
  # configured value is not an Integer, and compiles the regexp of hosts for
  # which no info is retrieved.
  def initialize
    super
    @registry.set_default([])
    link_info = @bot.config['url.display_link_info']
    @bot.config.items[:'url.display_link_info'].set_string(link_info.to_s) unless link_info.kind_of?(Integer)
    reset_no_info_hosts
  end
  45
  # Rebuilds @no_info_hosts, the case-insensitive regexp matching the hosts
  # for which no link info must be retrieved, from the url.no_info_hosts
  # configuration value.
  #
  # Empty patterns are ignored, and an empty list yields a regexp that never
  # matches: joining nothing would otherwise produce an empty regexp, which
  # matches every host and would disable info retrieval altogether.
  def reset_no_info_hosts
    patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
    @no_info_hosts =
      if patterns.empty?
        /(?!)/ # negative lookahead on the empty pattern: can never match
      else
        Regexp.new(patterns.join('|'), true)
      end
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  50
  # Returns the usage string of the plugin; the same text is returned
  # whatever +plugin+ and +topic+ are.
  def help(plugin, topic="")
    usage = "urls [<max>=4] => list <max> last urls mentioned in current channel, " \
            "urls search [<max>=4] <regexp> => search for matching urls. " \
            "In a private message, you must specify the channel to query, " \
            "eg. urls <channel> [max], urls search <channel> [max] <regexp>"
    usage
  end
  54
  # Extracts the contents of the first <title> element found in +pagedata+
  # and converts it with String#ircify_html. Returns nil when TITLE_RE does
  # not match.
  def get_title_from_html(pagedata)
    found = TITLE_RE.match(pagedata)
    found && found[1].ircify_html
  end
  59
  # Fetches +uri_str+ and builds a one-line description of what it points to.
  #
  # uri_str:: the URL to inspect, either a String or an already parsed URI
  # nick, channel, ircline:: optional context; when given, they are passed
  #                          along with the retrieved info to the :url_added
  #                          event
  #
  # Returns
  # * nil for URLs whose scheme is not http or https, and when there is
  #   nothing to display (no title found while url.titles_only is set);
  # * a "Sorry, ..." message for hosts matching url.no_info_hosts;
  # * otherwise the page title (plus the first paragraph when url.first_par
  #   is set) or, for pages without a title, content type, size and encoding.
  #
  # Raises UrlLinkError when the server answers with a non-success response,
  # and a RuntimeError describing the problem for any other failure while
  # fetching or processing the page. Errors raised by URI.parse propagate
  # unchanged.
  def get_title_for_url(uri_str, nick = nil, channel = nil, ircline = nil)

    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # Anchored on both ends, so that schemes merely containing "http" are
    # rejected too.
    return if url.scheme !~ /\Ahttps?\z/i

    if url.host =~ @no_info_hosts
      return "Sorry, info retrieval for #{url.host} is disabled"
    end

    logopts = Hash.new
    logopts[:nick] = nick if nick
    logopts[:channel] = channel if channel
    logopts[:ircline] = ircline if ircline

    title = nil
    extra = String.new

    begin
      debug "+ getting #{url.request_uri}"
      @bot.httputil.get_response(url) { |resp|
        case resp
        when Net::HTTPSuccess

          debug resp.to_hash

          if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
            # The page is text or HTML, so we can try finding a title and, if
            # requested, the first par.
            #
            # We act differently depending on whether we want the first par or
            # not: in the first case we download the initial part and then
            # parse it; in the second case we only download as much as we need
            # to find the title
            #
            if @bot.config['url.first_par']
              partial = resp.partial_body(@bot.config['http.info_bytes'])
              logopts[:title] = title = get_title_from_html(partial)
              if url.fragment and not url.fragment.empty?
                # The fragment comes straight from the URL, so it must be
                # escaped before being embedded in a regexp. Everything up to
                # and including the matching anchor is dropped, so that the
                # first paragraph is looked for after it.
                fragreg = /.*?<a\s+[^>]*name=["']?#{Regexp.escape(url.fragment)}["']?.*?>/im
                partial.sub!(fragreg,'')
              end
              first_par = Utils.ircify_first_html_par(partial, :strip => title)
              unless first_par.empty?
                logopts[:extra] = first_par
                extra << ", #{Bold}text#{Bold}: #{first_par}"
              end
              call_event(:url_added, url.to_s, logopts)
              return "#{Bold}title#{Bold}: #{title}#{extra}" if title
            else
              resp.partial_body(@bot.config['http.info_bytes']) { |part|
                logopts[:title] = title = get_title_from_html(part)
                call_event(:url_added, url.to_s, logopts)
                return "#{Bold}title#{Bold}: #{title}" if title
              }
            end
          # if nothing was found, provide more basic info, as for non-html pages
          else
            resp.no_cache = true
          end

          enc = resp['content-encoding']
          logopts[:extra] = String.new
          logopts[:extra] << "Content Type: #{resp['content-type']}"
          if enc
            logopts[:extra] << ", encoding: #{enc}"
            extra << ", #{Bold}encoding#{Bold}: #{enc}"
          end

          unless @bot.config['url.titles_only']
            # content doesn't have title, just display info.
            # The size gets thousands separators; a missing content-length
            # header leaves size nil, and it is then simply omitted.
            size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
            if size
              logopts[:extra] << ", size: #{size} bytes"
              size = ", #{Bold}size#{Bold}: #{size} bytes"
            end
            call_event(:url_added, url.to_s, logopts)
            return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
          end
          call_event(:url_added, url.to_s, logopts)
        else
          raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
        end
      }
      return nil
    rescue UrlLinkError
      # Already carries a user-presentable message: pass it on untouched.
      raise
    rescue StandardError, Timeout::Error => e
      # Timeout::Error is listed explicitly because it is not a StandardError
      # on old Ruby versions. SystemExit, Interrupt and other non-standard
      # exceptions are deliberately left alone.
      error e
      raise "connecting to site/processing information (#{e.message})"
    end
  end
 154
 155   def listen(m)
 156     return unless m.kind_of?(PrivMessage)
 157     return if m.address?
 158
 159     escaped = URI.escape(m.message, OUR_UNSAFE)
 160     urls = URI.extract(escaped)
 161     return if urls.empty?
 162     debug "found urls #{urls.inspect}"
 163     list = @registry[m.target]
 164     urls_displayed = 0
 165     urls.each { |urlstr|
 166       debug "working on #{urlstr}"
 167       next unless urlstr =~ /^https?:/
 168       title = nil
 169       debug "display link info: #{@bot.config['url.display_link_info']}"
 170       if @bot.config['url.display_link_info'] > urls_displayed
 171         urls_displayed += 1
 172         Thread.start do
 173           debug "Getting title for #{urlstr}..."
 174           begin
 175             title = get_title_for_url urlstr, m.source.nick, m.channel, m.message
 176             if title
 177               m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
 178               debug "Title found!"
 179             else
 180               debug "Title not found!"
 181             end
 182           rescue => e
 183             m.reply "Error #{e.message}"
 184           end
 185         end
 186       end
 187
 188       # check to see if this url is already listed
 189       next if list.find {|u| u.url == urlstr }
 190
 191       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 192       debug "#{list.length} urls so far"
 193       if list.length > @bot.config['url.max_urls']
 194         list.pop
 195       end
 196       debug "storing url #{url.url}"
 197       list.unshift url
 198       debug "#{list.length} urls now"
 199     }
 200     @registry[m.target] = list
 201   end
 202
 203   def reply_urls(opts={})
 204     list = opts[:list]
 205     max = opts[:max]
 206     channel = opts[:channel]
 207     m = opts[:msg]
 208     return unless list and max and m
 209     list[0..(max-1)].each do |url|
 210       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 211       if @bot.config['url.info_on_list']
 212         title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
 213         # If the url info was missing and we now have some, try to upgrade it
 214         if channel and title and not url.info
 215           ll = @registry[channel]
 216           debug ll
 217           if el = ll.find { |u| u.url == url.url }
 218             el.info = title
 219             @registry[channel] = ll
 220           end
 221         end
 222         disp << " --> #{title}" if title
 223       end
 224       m.reply disp, :overlong => :truncate
 225     end
 226   end
 227
 228   def urls(m, params)
 229     channel = params[:channel] ? params[:channel] : m.target
 230     max = params[:limit].to_i
 231     max = 10 if max > 10
 232     max = 1 if max < 1
 233     list = @registry[channel]
 234     if list.empty?
 235       m.reply "no urls seen yet for channel #{channel}"
 236     else
 237       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 238     end
 239   end
 240
 241   def search(m, params)
 242     channel = params[:channel] ? params[:channel] : m.target
 243     max = params[:limit].to_i
 244     string = params[:string]
 245     max = 10 if max > 10
 246     max = 1 if max < 1
 247     regex = Regexp.new(string, Regexp::IGNORECASE)
 248     list = @registry[channel].find_all {|url|
 249       regex.match(url.url) || regex.match(url.nick) ||
 250         (@bot.config['url.info_on_list'] && regex.match(url.info))
 251     }
 252     if list.empty?
 253       m.reply "no matches for channel #{channel}"
 254     else
 255       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 256     end
 257   end
 258 end
 259
 260 plugin = UrlPlugin.new
 261 plugin.map 'urls search :channel :limit :string', :action => 'search',
 262                           :defaults => {:limit => 4},
 263                           :requirements => {:limit => /^\d+$/},
 264                           :public => false
 265 plugin.map 'urls search :limit :string', :action => 'search',
 266                           :defaults => {:limit => 4},
 267                           :requirements => {:limit => /^\d+$/},
 268                           :private => false
 269 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 270                           :requirements => {:limit => /^\d+$/},
 271                           :public => false
 272 plugin.map 'urls :limit', :defaults => {:limit => 4},
 273                           :requirements => {:limit => /^\d+$/},
 274                           :private => false