data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class ::UrlLinkError < RuntimeError
   9 end
  10
  11 class UrlPlugin < Plugin
  12   TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
  13   LINK_INFO = "[Link Info]"
  14   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  15
  16   Config.register Config::IntegerValue.new('url.max_urls',
  17     :default => 100, :validate => Proc.new{|v| v > 0},
  18     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  19   Config.register Config::IntegerValue.new('url.display_link_info',
  20     :default => 0,
  21     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  22   Config.register Config::BooleanValue.new('url.titles_only',
  23     :default => false,
  24     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  25   Config.register Config::BooleanValue.new('url.first_par',
  26     :default => false,
  27     :desc => "Also try to get the first paragraph of a web page")
  28   Config.register Config::BooleanValue.new('url.info_on_list',
  29     :default => false,
  30     :desc => "Show link info when listing/searching for urls")
  31   Config.register Config::ArrayValue.new('url.no_info_hosts',
  32     :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
  33     :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
  34     :desc => "A list of regular expressions matching hosts for which no info should be provided")
  35
  36
  37   def initialize
  38     super
  39     @registry.set_default(Array.new)
  40     unless @bot.config['url.display_link_info'].kind_of?(Integer)
  41       @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
  42     end
  43     reset_no_info_hosts
  44   end
  45
  46   def reset_no_info_hosts
  47     @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
  48     debug "no info hosts regexp set to #{@no_info_hosts}"
  49   end
  50
  51   def help(plugin, topic="")
  52     "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  53   end
  54
  55   def get_title_from_html(pagedata)
  56     return unless TITLE_RE.match(pagedata)
  57     $1.ircify_html
  58   end
  59
  60   def get_title_for_url(uri_str, opts = {})
  61
  62     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
  63     return if url.scheme !~ /https?/
  64
  65     if url.host =~ @no_info_hosts
  66       return "Sorry, info retrieval for #{url.host} is disabled"
  67     end
  68
  69     logopts = opts.dup
  70
  71     title = nil
  72     extra = String.new
  73
  74     begin
  75       debug "+ getting #{url.request_uri}"
  76       @bot.httputil.get_response(url) { |resp|
  77         case resp
  78         when Net::HTTPSuccess
  79
  80           debug resp.to_hash
  81
  82           if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
  83             # The page is text or HTML, so we can try finding a title and, if
  84             # requested, the first par.
  85             #
  86             # We act differently depending on whether we want the first par or
  87             # not: in the first case we download the initial part and the parse
  88             # it; in the second case we only download as much as we need to find
  89             # the title
  90             #
  91             if @bot.config['url.first_par']
  92               partial = resp.partial_body(@bot.config['http.info_bytes'])
  93               logopts[:title] = title = get_title_from_html(partial)
  94               if url.fragment and not url.fragment.empty?
  95                 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
  96                 partial.sub!(fragreg,'')
  97               end
  98               first_par = Utils.ircify_first_html_par(partial, :strip => title)
  99               unless first_par.empty?
 100                 logopts[:extra] = first_par
 101                 extra << ", #{Bold}text#{Bold}: #{first_par}"
 102               end
 103               call_event(:url_added, url.to_s, logopts)
 104               return "#{Bold}title#{Bold}: #{title}#{extra}" if title
 105             else
 106               resp.partial_body(@bot.config['http.info_bytes']) { |part|
 107                 logopts[:title] = title = get_title_from_html(part)
 108                 call_event(:url_added, url.to_s, logopts)
 109                 return "#{Bold}title#{Bold}: #{title}" if title
 110               }
 111             end
 112           # if nothing was found, provide more basic info, as for non-html pages
 113           else
 114             resp.no_cache = true
 115           end
 116
 117           enc = resp['content-encoding']
 118           logopts[:extra] = String.new
 119           logopts[:extra] << "Content Type: #{resp['content-type']}"
 120           if enc
 121             logopts[:extra] << ", encoding: #{enc}"
 122             extra << ", #{Bold}encoding#{Bold}: #{enc}"
 123           end
 124
 125           unless @bot.config['url.titles_only']
 126             # content doesn't have title, just display info.
 127             size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
 128             if size
 129               logopts[:extra] << ", size: #{size} bytes"
 130               size = ", #{Bold}size#{Bold}: #{size} bytes"
 131             end
 132             call_event(:url_added, url.to_s, logopts)
 133             return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
 134           end
 135           call_event(:url_added, url.to_s, logopts)
 136         else
 137           raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
 138         end
 139       }
 140       return nil
 141     rescue Exception => e
 142       case e
 143       when UrlLinkError
 144         raise e
 145       else
 146         error e
 147         raise "connecting to site/processing information (#{e.message})"
 148       end
 149     end
 150   end
 151
 152   def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
 153     return if urls.empty?
 154     debug "found urls #{urls.inspect}"
 155     list = m.public? ? @registry[m.target] : nil
 156     debug "display link info: #{display_info}"
 157     urls_displayed = 0
 158     urls.each do |urlstr|
 159       debug "working on #{urlstr}"
 160       next unless urlstr =~ /^https?:/
 161       title = nil
 162       debug "Getting title for #{urlstr}..."
 163       begin
 164         title = get_title_for_url(urlstr,
 165                                   :nick => m.source.nick,
 166                                   :channel => m.channel,
 167                                   :ircline => m.message)
 168         debug "Title #{title ? '' : 'not '} found"
 169       rescue => e
 170         m.reply "Error #{e.message}"
 171       end
 172
 173       if display_info > urls_displayed
 174         if title
 175           m.reply("#{LINK_INFO} #{title}", :overlong => :truncate)
 176           urls_displayed += 1
 177         end
 178       end
 179
 180       next unless list
 181
 182       # check to see if this url is already listed
 183       next if list.find {|u| u.url == urlstr }
 184
 185       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 186       debug "#{list.length} urls so far"
 187       list.pop if list.length > @bot.config['url.max_urls']
 188       debug "storing url #{url.url}"
 189       list.unshift url
 190       debug "#{list.length} urls now"
 191     end
 192     @registry[m.target] = list
 193   end
 194
 195   def info(m, params)
 196     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 197     urls = URI.extract(escaped)
 198     Thread.new { handle_urls(m, urls, params[:urls].length) }
 199   end
 200
 201   def listen(m)
 202     return unless m.kind_of?(PrivMessage)
 203     return if m.address?
 204
 205     escaped = URI.escape(m.message, OUR_UNSAFE)
 206     urls = URI.extract(escaped)
 207     Thread.new { handle_urls(m, urls) }
 208   end
 209
 210   def reply_urls(opts={})
 211     list = opts[:list]
 212     max = opts[:max]
 213     channel = opts[:channel]
 214     m = opts[:msg]
 215     return unless list and max and m
 216     list[0..(max-1)].each do |url|
 217       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 218       if @bot.config['url.info_on_list']
 219         title = url.info ||
 220           get_title_for_url(url.url,
 221                             :nick => url.nick, :channel => channel) rescue nil
 222         # If the url info was missing and we now have some, try to upgrade it
 223         if channel and title and not url.info
 224           ll = @registry[channel]
 225           debug ll
 226           if el = ll.find { |u| u.url == url.url }
 227             el.info = title
 228             @registry[channel] = ll
 229           end
 230         end
 231         disp << " --> #{title}" if title
 232       end
 233       m.reply disp, :overlong => :truncate
 234     end
 235   end
 236
 237   def urls(m, params)
 238     channel = params[:channel] ? params[:channel] : m.target
 239     max = params[:limit].to_i
 240     max = 10 if max > 10
 241     max = 1 if max < 1
 242     list = @registry[channel]
 243     if list.empty?
 244       m.reply "no urls seen yet for channel #{channel}"
 245     else
 246       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 247     end
 248   end
 249
 250   def search(m, params)
 251     channel = params[:channel] ? params[:channel] : m.target
 252     max = params[:limit].to_i
 253     string = params[:string]
 254     max = 10 if max > 10
 255     max = 1 if max < 1
 256     regex = Regexp.new(string, Regexp::IGNORECASE)
 257     list = @registry[channel].find_all {|url|
 258       regex.match(url.url) || regex.match(url.nick) ||
 259         (@bot.config['url.info_on_list'] && regex.match(url.info))
 260     }
 261     if list.empty?
 262       m.reply "no matches for channel #{channel}"
 263     else
 264       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 265     end
 266   end
 267 end
 268
 269 plugin = UrlPlugin.new
 270 plugin.map 'urls info *urls', :action => 'info'
 271 plugin.map 'url info *urls', :action => 'info'
 272 plugin.map 'urls search :channel :limit :string', :action => 'search',
 273                           :defaults => {:limit => 4},
 274                           :requirements => {:limit => /^\d+$/},
 275                           :public => false
 276 plugin.map 'urls search :limit :string', :action => 'search',
 277                           :defaults => {:limit => 4},
 278                           :requirements => {:limit => /^\d+$/},
 279                           :private => false
 280 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 281                           :requirements => {:limit => /^\d+$/},
 282                           :public => false
 283 plugin.map 'urls :limit', :defaults => {:limit => 4},
 284                           :requirements => {:limit => /^\d+$/},
 285                           :private => false