data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class UrlPlugin < Plugin
   9   LINK_INFO = "[Link Info]"
  10   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  11
  # --- Configuration keys registered by this plugin ---

  # Cap on the number of urls remembered per target.
  Config.register(Config::IntegerValue.new(
    'url.max_urls',
    :desc => "Maximum number of urls to store. New urls replace oldest ones.",
    :default => 100,
    :validate => Proc.new { |v| v > 0 }
  ))

  # How many links per line get their info announced automatically.
  Config.register(Config::IntegerValue.new(
    'url.display_link_info',
    :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)",
    :default => 0
  ))

  # Automatic shortening of long urls.
  Config.register(Config::BooleanValue.new(
    'url.auto_shorten',
    :desc => "Automatically spit out shortened URLs when they're seen. Check shortenurls for config options",
    :default => false
  ))
  Config.register(Config::IntegerValue.new(
    'url.auto_shorten_min_length',
    :desc => "Minimum length of URL to auto-shorten.  Only has an effect when url.auto_shorten is true.",
    :default => 48
  ))

  # What is included in the announced link info.
  Config.register(Config::BooleanValue.new(
    'url.titles_only',
    :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)",
    :default => false
  ))
  Config.register(Config::BooleanValue.new(
    'url.first_par',
    :desc => "Also try to get the first paragraph of a web page",
    :default => false
  ))
  Config.register(Config::IntegerValue.new(
    'url.first_par_length',
    :desc => "The max length of the first paragraph",
    :default => 150
  ))
  Config.register(Config::ArrayValue.new(
    'url.first_par_whitelist',
    :desc => "List of url patterns to show the content for.",
    :default => ['twitter.com']
  ))
  Config.register(Config::BooleanValue.new(
    'url.info_on_list',
    :desc => "Show link info when listing/searching for urls",
    :default => false
  ))

  # Hosts, channels and users for which link info is suppressed.
  # Changing url.no_info_hosts rebuilds the plugin's host regexp.
  Config.register(Config::ArrayValue.new(
    'url.no_info_hosts',
    :desc => "A list of regular expressions matching hosts for which no info should be provided",
    :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
    :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts }
  ))
  Config.register(Config::ArrayValue.new(
    'url.only_on_channels',
    :desc => "Show link info only on these channels",
    :default => []
  ))
  Config.register(Config::ArrayValue.new(
    'url.ignore',
    :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
    :default => []
  ))
  49
  # Prepare plugin state: an empty-array registry default, the
  # blocked-host regexp and the :htmlinfo filter group.
  def initialize
    super
    @registry.set_default([])

    # A stored value that is not an Integer is fed back through the config
    # item's string setter (presumably to coerce values saved in an older
    # format -- TODO confirm).
    link_info = @bot.config['url.display_link_info']
    unless link_info.kind_of?(Integer)
      @bot.config.items[:'url.display_link_info'].set_string(link_info.to_s)
    end

    reset_no_info_hosts
    self.filter_group = :htmlinfo
    load_filters
  end
  60
  # Rebuild @no_info_hosts, the case-insensitive regexp used to decide
  # which hosts never get link info, from the 'url.no_info_hosts' list.
  #
  # Empty patterns are dropped, and an empty list yields a regexp that never
  # matches: joining nothing (or an empty alternative) would otherwise
  # produce a regexp matching every host, disabling link info everywhere.
  def reset_no_info_hosts
    patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
    @no_info_hosts =
      if patterns.empty?
        /(?!)/
      else
        Regexp.new(patterns.join('|'), true)
      end
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  65
  # Usage text for the plugin's commands; both arguments are ignored.
  def help(plugin, topic="")
    [
      "url info <url> => display link info for <url> ",
      "(set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), ",
      "urls [<max>=4] => list <max> last urls mentioned in current channel, ",
      "urls search [<max>=4] <regexp> => search for matching urls. ",
      "In a private message, you must specify the channel to query, ",
      "eg. urls <channel> [max], urls search <channel> [max] <regexp>"
    ].join
  end
  69
  # Return the title of +pagedata+ as produced by its ircify_html_title
  # method.
  def get_title_from_html(pagedata)
    pagedata.ircify_html_title
  end
  73
  # Collect a one-line description for a link.
  #
  # uri_str:: a URI object, or a string accepted by URI.parse
  # opts::    caller options. :always_reply makes a blocked host produce an
  #           explanatory message instead of +false+. A copy of the hash,
  #           extended with :htmlinfo, :title and :extra, is passed along
  #           with the :url_added event.
  #
  # Returns
  # * nil when the URL scheme is not http or https;
  # * an "Unable to retrieve info" string when the host cannot be resolved;
  # * +false+ (or an apology string with :always_reply) when the host, one
  #   of its aliases or one of its addresses matches @no_info_hosts;
  # * otherwise a comma-separated info string (title, text, type, encoding,
  #   size), or nil when 'url.titles_only' is set and no title was found.
  #
  # Raises UrlLinkError unchanged; any other failure while fetching or
  # processing the page is re-raised as a RuntimeError.
  def get_title_for_url(uri_str, opts = {})
    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # Anchored on purpose: an unanchored /https?/ also let through schemes
    # that merely contain "http".
    return if url.scheme !~ /\Ahttps?\z/i

    # also check the ip, the canonical name and the aliases
    begin
      checks = TCPSocket.gethostbyname(url.host)
      # The result is [name, aliases, address family, address, ...]; drop
      # the family at index 2. Counting from the end would remove an address
      # (and so skip checking it) whenever several addresses are returned.
      checks.delete_at(2)
    rescue => e
      return "Unable to retrieve info for #{url.host}: #{e.message}"
    end

    checks << url.host
    checks.flatten!

    unless checks.grep(@no_info_hosts).empty?
      return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
    end

    logopts = opts.dup

    title = nil
    extra = []

    begin
      debug "+ getting info for #{url.request_uri}"
      info = @bot.filter(:htmlinfo, url)
      logopts[:htmlinfo] = info
      resp = info[:headers]

      logopts[:title] = title = info[:title]

      if info[:content]
        logopts[:extra] = info[:content]

        max_length = @bot.config['url.first_par_length']

        # With a non-empty whitelist, content is shown only for urls
        # matching one of its patterns; an empty whitelist allows all urls.
        whitelist = @bot.config['url.first_par_whitelist']
        allowed = whitelist.empty? || whitelist.any? do |pattern|
          Regexp.new(pattern, Regexp::IGNORECASE).match(url.to_s)
        end
        content = allowed ? info[:content][0...max_length] : nil

        extra << "#{Bold}text#{Bold}: #{content}" if @bot.config['url.first_par'] and content
      else
        # No textual content: describe the resource from its headers.
        logopts[:extra] = String.new
        logopts[:extra] << "Content Type: #{resp['content-type']}"
        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
        if enc = resp['content-encoding']
          logopts[:extra] << ", encoding: #{enc}"
          extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
        end

        # Insert thousands separators; a missing or unusable content-length
        # header simply means no size is reported.
        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
        if size
          logopts[:extra] << ", size: #{size} bytes"
          extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
        end
      end
    rescue Exception => e
      # Every failure is converted into one error for the caller, except
      # UrlLinkError, which is passed through untouched.
      case e
      when UrlLinkError
        raise e
      else
        error e
        raise "connecting to site/processing information (#{e.message})"
      end
    end

    call_event(:url_added, url.to_s, logopts)
    if title
      extra.unshift("#{Bold}title#{Bold}: #{title}")
    end
    return extra.join(", ") if title or not @bot.config['url.titles_only']
  end
 157
 158   def handle_urls(m, params={})
 159     opts = {
 160       :display_info => @bot.config['url.display_link_info'],
 161       :channels => @bot.config['url.only_on_channels'],
 162       :ignore => @bot.config['url.ignore']
 163     }.merge params
 164     urls = opts[:urls]
 165     display_info= opts[:display_info]
 166     channels = opts[:channels]
 167     ignore = opts[:ignore]
 168
 169     unless channels.empty?
 170       return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
 171     end
 172
 173     ignore.each { |u| return if m.source.matches?(u) }
 174
 175     return if urls.empty?
 176     debug "found urls #{urls.inspect}"
 177     list = m.public? ? @registry[m.target] : nil
 178     debug "display link info: #{display_info}"
 179     urls_displayed = 0
 180     urls.each do |urlstr|
 181       debug "working on #{urlstr}"
 182       next unless urlstr =~ /^https?:\/\/./
 183       if @bot.config['url.auto_shorten'] == true and
 184          urlstr.length >= @bot.config['url.auto_shorten_min_length']
 185         m.reply(bot.plugins['shortenurls'].shorten(nil, {:url=>urlstr, :called=>true}))
 186         next
 187       end
 188       title = nil
 189       debug "Getting title for #{urlstr}..."
 190       reply = nil
 191       begin
 192         title = get_title_for_url(urlstr,
 193                                   :always_reply => m.address?,
 194                                   :nick => m.source.nick,
 195                                   :channel => m.channel,
 196                                   :ircline => m.message)
 197         debug "Title #{title ? '' : 'not '} found"
 198         reply = "#{LINK_INFO} #{title}" if title
 199       rescue => e
 200         debug e
 201         # we might get a 404 because of trailing punctuation, so we try again
 202         # with the last character stripped. this might generate invalid URIs
 203         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 204         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 205           # chop off last non-word character from the unescaped version of
 206           # the URL, and retry if we still have enough string to look like a
 207           # minimal URL
 208           unescaped = URI.unescape(urlstr)
 209           debug "Unescaped: #{unescaped}"
 210           if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
 211             urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
 212             retry
 213           else
 214             debug "Not retrying #{unescaped}"
 215           end
 216         end
 217         reply = "Error #{e.message}"
 218       end
 219
 220       if display_info > urls_displayed
 221         if reply
 222           m.reply reply, :overlong => :truncate, :to => :public,
 223             :nick => (m.address? ? :auto : false)
 224           urls_displayed += 1
 225         end
 226       end
 227
 228       next unless list
 229
 230       # check to see if this url is already listed
 231       next if list.find {|u| u.url == urlstr }
 232
 233       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 234       debug "#{list.length} urls so far"
 235       list.pop if list.length > @bot.config['url.max_urls']
 236       debug "storing url #{url.url}"
 237       list.unshift url
 238       debug "#{list.length} urls now"
 239     end
 240     @registry[m.target] = list
 241   end
 242
 243   def info(m, params)
 244     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 245     urls = URI.extract(escaped)
 246     Thread.new do
 247       handle_urls(m,
 248                   :urls => urls,
 249                   :display_info => params[:urls].length,
 250                   :channels => [])
 251     end
 252   end
 253
 254   def message(m)
 255     return if m.address?
 256
 257     escaped = URI.escape(m.message, OUR_UNSAFE)
 258     urls = URI.extract(escaped, ['http', 'https'])
 259     return if urls.empty?
 260     Thread.new { handle_urls(m, :urls => urls) }
 261   end
 262
 263   def reply_urls(opts={})
 264     list = opts[:list]
 265     max = opts[:max]
 266     channel = opts[:channel]
 267     m = opts[:msg]
 268     return unless list and max and m
 269     list[0..(max-1)].each do |url|
 270       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 271       if @bot.config['url.info_on_list']
 272         title = url.info ||
 273           get_title_for_url(url.url,
 274                             :nick => url.nick, :channel => channel) rescue nil
 275         # If the url info was missing and we now have some, try to upgrade it
 276         if channel and title and not url.info
 277           ll = @registry[channel]
 278           debug ll
 279           if el = ll.find { |u| u.url == url.url }
 280             el.info = title
 281             @registry[channel] = ll
 282           end
 283         end
 284         disp << " --> #{title}" if title
 285       end
 286       m.reply disp, :overlong => :truncate
 287     end
 288   end
 289
 290   def urls(m, params)
 291     channel = params[:channel] ? params[:channel] : m.target
 292     max = params[:limit].to_i
 293     max = 10 if max > 10
 294     max = 1 if max < 1
 295     list = @registry[channel]
 296     if list.empty?
 297       m.reply "no urls seen yet for channel #{channel}"
 298     else
 299       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 300     end
 301   end
 302
 303   def search(m, params)
 304     channel = params[:channel] ? params[:channel] : m.target
 305     max = params[:limit].to_i
 306     string = params[:string]
 307     max = 10 if max > 10
 308     max = 1 if max < 1
 309     regex = Regexp.new(string, Regexp::IGNORECASE)
 310     list = @registry[channel].find_all {|url|
 311       regex.match(url.url) || regex.match(url.nick) ||
 312         (@bot.config['url.info_on_list'] && regex.match(url.info))
 313     }
 314     if list.empty?
 315       m.reply "no matches for channel #{channel}"
 316     else
 317       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 318     end
 319   end
 320 end
 321
 322 plugin = UrlPlugin.new
 323 plugin.map 'urls info *urls', :action => 'info'
 324 plugin.map 'url info *urls', :action => 'info'
 325 plugin.map 'urls search :channel :limit :string', :action => 'search',
 326                           :defaults => {:limit => 4},
 327                           :requirements => {:limit => /^\d+$/},
 328                           :public => false
 329 plugin.map 'urls search :limit :string', :action => 'search',
 330                           :defaults => {:limit => 4},
 331                           :requirements => {:limit => /^\d+$/},
 332                           :private => false
 333 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 334                           :requirements => {:limit => /^\d+$/},
 335                           :public => false
 336 plugin.map 'urls :limit', :defaults => {:limit => 4},
 337                           :requirements => {:limit => /^\d+$/},
 338                           :private => false