data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class ::UrlLinkError < RuntimeError
   9 end
  10
  11 class UrlPlugin < Plugin
  # Captures the text between <title> and </title> (case-insensitive,
  # spanning newlines).
  TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im

  # Prefix of the replies that display information about a pasted link.
  LINK_INFO = "[Link Info]"

  # Characters that get percent-escaped before URLs are extracted from a
  # line: anything that is neither URI-unreserved nor URI-reserved, with the
  # exception of '%', '#' and the space character.
  OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')

  # --- Configuration keys -------------------------------------------------

  Config.register(Config::IntegerValue.new('url.max_urls',
    :default => 100,
    :validate => Proc.new { |count| count > 0 },
    :desc => "Maximum number of urls to store. New urls replace oldest ones."))

  Config.register(Config::IntegerValue.new('url.display_link_info',
    :default => 0,
    :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)"))

  Config.register(Config::BooleanValue.new('url.titles_only',
    :default => false,
    :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)"))

  Config.register(Config::BooleanValue.new('url.first_par',
    :default => false,
    :desc => "Also try to get the first paragraph of a web page"))

  Config.register(Config::BooleanValue.new('url.info_on_list',
    :default => false,
    :desc => "Show link info when listing/searching for urls"))

  # Changing this list makes the plugin recompile its host-matching regexp.
  Config.register(Config::ArrayValue.new('url.no_info_hosts',
    :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
    :on_change => Proc.new { |bot, _value| bot.plugins['url'].reset_no_info_hosts },
    :desc => "A list of regular expressions matching hosts for which no info should be provided"))
  35
  36
  # Prepares the plugin: makes the registry default to an empty list,
  # normalizes the url.display_link_info setting and compiles the regexp of
  # hosts for which no info is retrieved.
  def initialize
    super
    @registry.set_default([])

    # If the configured value is not an Integer, feed its string form back
    # through the config item (presumably so that it gets parsed into an
    # Integer -- confirm against Config::IntegerValue#set_string).
    link_info = @bot.config['url.display_link_info']
    unless link_info.kind_of?(Integer)
      @bot.config.items[:'url.display_link_info'].set_string(link_info.to_s)
    end

    reset_no_info_hosts
  end
  45
  # Rebuilds @no_info_hosts, the case-insensitive regexp matched against a
  # URL's host to decide whether info retrieval is disabled for it, from the
  # url.no_info_hosts configuration list.
  #
  # Blank entries are ignored. When no usable pattern is left the regexp is
  # one that never matches: joining an empty list would otherwise produce an
  # empty regexp, which matches *every* host and would silently disable info
  # retrieval everywhere.
  def reset_no_info_hosts
    patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.strip.empty? }
    @no_info_hosts =
      if patterns.empty?
        /(?!)/ # empty negative lookahead: never matches
      else
        Regexp.new(patterns.join('|'), true)
      end
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  50
  # Returns the usage text of the plugin. Both arguments are ignored: the
  # same text is returned for every topic.
  #
  # The text now also documents the "urls info" command, which is mapped by
  # the plugin but was missing from the help.
  def help(plugin, topic="")
    "urls info <url> [<url> ...] => display link info for the given urls, " \
    "urls [<max>=4] => list <max> last urls mentioned in current channel, " \
    "urls search [<max>=4] <regexp> => search for matching urls. " \
    "In a private message, you must specify the channel to query, " \
    "eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  end
  54
  # Extracts the page title from +pagedata+ using TITLE_RE.
  #
  # Returns the captured title converted with String#ircify_html, or nil
  # when +pagedata+ contains no title element.
  def get_title_from_html(pagedata)
    found = TITLE_RE.match(pagedata)
    found ? found[1].ircify_html : nil
  end
  59
  # Fetches +uri_str+ and builds a short description of what it points to.
  #
  # uri_str:: a URI object, or a string accepted by URI.parse
  # nick, channel, ircline:: optional context; each one that is given is
  #                          passed along in the options of the :url_added
  #                          event
  #
  # Returns
  # * nil when the scheme is not exactly http or https, or when nothing is
  #   to be displayed (url.titles_only set and no title found);
  # * a "Sorry, ..." notice when the host matches @no_info_hosts;
  # * "title: ..." (plus the first paragraph when url.first_par is set) for
  #   text/HTML/XML responses in which a title is found;
  # * "type: ..." with size and encoding details otherwise.
  #
  # Raises UrlLinkError when the server answers with a non-success response
  # and RuntimeError for any other failure while retrieving or processing the
  # page. Errors raised by URI.parse itself are not wrapped.
  def get_title_for_url(uri_str, nick = nil, channel = nil, ircline = nil)

    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # Anchored on both ends: an unanchored /https?/ would also accept
    # schemes that merely contain "http", such as "shttp".
    return if url.scheme !~ /\Ahttps?\z/i

    if url.host =~ @no_info_hosts
      return "Sorry, info retrieval for #{url.host} is disabled"
    end

    logopts = Hash.new
    logopts[:nick] = nick if nick
    logopts[:channel] = channel if channel
    logopts[:ircline] = ircline if ircline

    title = nil
    extra = String.new

    begin
      debug "+ getting #{url.request_uri}"
      @bot.httputil.get_response(url) { |resp|
        case resp
        when Net::HTTPSuccess

          debug resp.to_hash

          if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
            # The page is text or HTML, so we can try finding a title and, if
            # requested, the first par.
            #
            # We act differently depending on whether we want the first par or
            # not: in the first case we download the initial part and then
            # parse it; in the second case we only download as much as we need
            # to find the title
            #
            if @bot.config['url.first_par']
              partial = resp.partial_body(@bot.config['http.info_bytes'])
              logopts[:title] = title = get_title_from_html(partial)
              if url.fragment and not url.fragment.empty?
                # The fragment comes straight from the (user supplied) URL:
                # escape it so that it is matched literally and cannot break
                # or alter the regexp.
                fragreg = /.*?<a\s+[^>]*name=["']?#{Regexp.escape(url.fragment)}["']?.*?>/im
                partial.sub!(fragreg,'')
              end
              first_par = Utils.ircify_first_html_par(partial, :strip => title)
              unless first_par.empty?
                logopts[:extra] = first_par
                extra << ", #{Bold}text#{Bold}: #{first_par}"
              end
              call_event(:url_added, url.to_s, logopts)
              return "#{Bold}title#{Bold}: #{title}#{extra}" if title
            else
              resp.partial_body(@bot.config['http.info_bytes']) { |part|
                logopts[:title] = title = get_title_from_html(part)
                call_event(:url_added, url.to_s, logopts)
                return "#{Bold}title#{Bold}: #{title}" if title
              }
            end
          # if nothing was found, provide more basic info, as for non-html pages
          else
            resp.no_cache = true
          end

          enc = resp['content-encoding']
          logopts[:extra] = String.new
          logopts[:extra] << "Content Type: #{resp['content-type']}"
          if enc
            logopts[:extra] << ", encoding: #{enc}"
            extra << ", #{Bold}encoding#{Bold}: #{enc}"
          end

          unless @bot.config['url.titles_only']
            # content doesn't have title, just display info.
            size_info = nil
            length = resp['content-length']
            # Insert thousands separators; the header may be missing.
            size = length && length.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
            if size
              logopts[:extra] << ", size: #{size} bytes"
              size_info = ", #{Bold}size#{Bold}: #{size} bytes"
            end
            call_event(:url_added, url.to_s, logopts)
            return "#{Bold}type#{Bold}: #{resp['content-type']}#{size_info}#{extra}"
          end
          call_event(:url_added, url.to_s, logopts)
        else
          raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
        end
      }
      return nil
    rescue UrlLinkError
      # Broken links are reported to the caller unchanged.
      raise
    rescue Exception => e
      # Deliberately broad, as before: every other failure is logged and
      # reported to the caller as a RuntimeError.
      error e
      raise "connecting to site/processing information (#{e.message})"
    end
  end
 154
 155   def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
 156     return if urls.empty?
 157     debug "found urls #{urls.inspect}"
 158     if m.public?
 159       list = @registry[m.target]
 160     else
 161       list = nil
 162     end
 163     urls_displayed = 0
 164     urls.each { |urlstr|
 165       debug "working on #{urlstr}"
 166       next unless urlstr =~ /^https?:/
 167       title = nil
 168       debug "display link info: #{display_info}"
 169       if display_info > urls_displayed
 170         urls_displayed += 1
 171         Thread.start do
 172           debug "Getting title for #{urlstr}..."
 173           begin
 174             title = get_title_for_url urlstr, m.source.nick, m.channel, m.message
 175             if title
 176               m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
 177               debug "Title found!"
 178             else
 179               debug "Title not found!"
 180             end
 181           rescue => e
 182             m.reply "Error #{e.message}"
 183           end
 184         end
 185       end
 186
 187       next unless list
 188
 189       # check to see if this url is already listed
 190       next if list.find {|u| u.url == urlstr }
 191
 192       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 193       debug "#{list.length} urls so far"
 194       if list.length > @bot.config['url.max_urls']
 195         list.pop
 196       end
 197       debug "storing url #{url.url}"
 198       list.unshift url
 199       debug "#{list.length} urls now"
 200     }
 201     @registry[m.target] = list
 202   end
 203
 204   def info(m, params)
 205     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 206     urls = URI.extract(escaped)
 207     handle_urls(m, urls, params[:urls].length)
 208   end
 209
 210   def listen(m)
 211     return unless m.kind_of?(PrivMessage)
 212     return if m.address?
 213
 214     escaped = URI.escape(m.message, OUR_UNSAFE)
 215     urls = URI.extract(escaped)
 216     handle_urls(m, urls)
 217   end
 218
 219   def reply_urls(opts={})
 220     list = opts[:list]
 221     max = opts[:max]
 222     channel = opts[:channel]
 223     m = opts[:msg]
 224     return unless list and max and m
 225     list[0..(max-1)].each do |url|
 226       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 227       if @bot.config['url.info_on_list']
 228         title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
 229         # If the url info was missing and we now have some, try to upgrade it
 230         if channel and title and not url.info
 231           ll = @registry[channel]
 232           debug ll
 233           if el = ll.find { |u| u.url == url.url }
 234             el.info = title
 235             @registry[channel] = ll
 236           end
 237         end
 238         disp << " --> #{title}" if title
 239       end
 240       m.reply disp, :overlong => :truncate
 241     end
 242   end
 243
 244   def urls(m, params)
 245     channel = params[:channel] ? params[:channel] : m.target
 246     max = params[:limit].to_i
 247     max = 10 if max > 10
 248     max = 1 if max < 1
 249     list = @registry[channel]
 250     if list.empty?
 251       m.reply "no urls seen yet for channel #{channel}"
 252     else
 253       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 254     end
 255   end
 256
 257   def search(m, params)
 258     channel = params[:channel] ? params[:channel] : m.target
 259     max = params[:limit].to_i
 260     string = params[:string]
 261     max = 10 if max > 10
 262     max = 1 if max < 1
 263     regex = Regexp.new(string, Regexp::IGNORECASE)
 264     list = @registry[channel].find_all {|url|
 265       regex.match(url.url) || regex.match(url.nick) ||
 266         (@bot.config['url.info_on_list'] && regex.match(url.info))
 267     }
 268     if list.empty?
 269       m.reply "no matches for channel #{channel}"
 270     else
 271       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 272     end
 273   end
 274 end
 275
 276 plugin = UrlPlugin.new
 277 plugin.map 'urls info *urls', :action => 'info'
 278 plugin.map 'urls search :channel :limit :string', :action => 'search',
 279                           :defaults => {:limit => 4},
 280                           :requirements => {:limit => /^\d+$/},
 281                           :public => false
 282 plugin.map 'urls search :limit :string', :action => 'search',
 283                           :defaults => {:limit => 4},
 284                           :requirements => {:limit => /^\d+$/},
 285                           :private => false
 286 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 287                           :requirements => {:limit => /^\d+$/},
 288                           :public => false
 289 plugin.map 'urls :limit', :defaults => {:limit => 4},
 290                           :requirements => {:limit => /^\d+$/},
 291                           :private => false