data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 require 'socket'
   7
   8 define_structure :Url, :channel, :nick, :time, :url, :info
   9
  10 class UrlPlugin < Plugin
  # Prefix put in front of every link-info line sent as a reply.
  LINK_INFO = "[Link Info]"

  # Characters that get percent-escaped before text is handed to the URI
  # routines: anything that is not an RFC 2396 unreserved or reserved
  # character, '%', '#' or a space.
  #
  # The "no encoding" (byte-wise) behaviour is requested through the
  # Regexp::NOENCODING option flag. The legacy third "kcode" argument of
  # Regexp.new ('N') was deprecated in Ruby 3.2 and is no longer accepted
  # by later releases.
  OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", Regexp::NOENCODING)
  13
  # --- Storage -------------------------------------------------------------

  # Upper bound on the number of URLs remembered; must be positive.
  Config.register(Config::IntegerValue.new(
    'url.max_urls',
    default: 100,
    validate: proc { |v| v > 0 },
    desc: "Maximum number of urls to store. New urls replace oldest ones."
  ))

  # --- Automatic link info -------------------------------------------------

  # How many links per line get their info displayed (0 disables).
  Config.register(Config::IntegerValue.new(
    'url.display_link_info',
    default: 0,
    desc: "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)"
  ))

  # Reply with a shortened URL instead of link info.
  Config.register(Config::BooleanValue.new(
    'url.auto_shorten',
    default: false,
    desc: "Automatically spit out shortened URLs when they're seen. Check shortenurls for config options"
  ))

  # Length threshold used together with url.auto_shorten.
  Config.register(Config::IntegerValue.new(
    'url.auto_shorten_min_length',
    default: 48,
    desc: "Minimum length of URL to auto-shorten.  Only has an effect when url.auto_shorten is true."
  ))

  # Suppress info for links without a title.
  Config.register(Config::BooleanValue.new(
    'url.titles_only',
    default: false,
    desc: "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)"
  ))

  # --- First paragraph -----------------------------------------------------

  Config.register(Config::BooleanValue.new(
    'url.first_par',
    default: false,
    desc: "Also try to get the first paragraph of a web page"
  ))

  Config.register(Config::IntegerValue.new(
    'url.first_par_length',
    default: 150,
    desc: "The max length of the first paragraph"
  ))

  # Patterns are compiled as case-insensitive regexps and matched against
  # the full URL in get_title_for_url.
  Config.register(Config::ArrayValue.new(
    'url.first_par_whitelist',
    default: ['twitter.com'],
    desc: "List of url patterns to show the content for."
  ))

  # --- Listing and filtering -----------------------------------------------

  Config.register(Config::BooleanValue.new(
    'url.info_on_list',
    default: false,
    desc: "Show link info when listing/searching for urls"
  ))

  # Changing this setting triggers a rebuild of the plugin's compiled regexp.
  Config.register(Config::ArrayValue.new(
    'url.no_info_hosts',
    default: ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
    on_change: proc { |bot, v| bot.plugins['url'].reset_no_info_hosts },
    desc: "A list of regular expressions matching hosts for which no info should be provided"
  ))

  Config.register(Config::ArrayValue.new(
    'url.only_on_channels',
    desc: "Show link info only on these channels",
    default: []
  ))

  Config.register(Config::ArrayValue.new(
    'url.ignore',
    desc: "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
    default: []
  ))
  51
  # Prepares the plugin: registry entries default to an empty Array, the
  # no-info host regexp is compiled, and the :htmlinfo filter group is
  # selected before the filters are loaded.
  def initialize
    super
    @registry.set_default([])

    # If the stored url.display_link_info value is not an Integer, feed its
    # string form back through the config item's set_string.
    # NOTE(review): presumably a migration from an older, non-integer
    # setting -- confirm.
    link_info = @bot.config['url.display_link_info']
    unless link_info.kind_of?(Integer)
      @bot.config.items[:'url.display_link_info'].set_string(link_info.to_s)
    end

    reset_no_info_hosts
    self.filter_group = :htmlinfo
    load_filters
  end
  62
  # Recompiles @no_info_hosts, the case-insensitive regexp built from the
  # 'url.no_info_hosts' setting (its entries are joined with '|').
  #
  # Empty entries are dropped, and an empty list compiles to a regexp that
  # never matches. Joining an empty list would otherwise produce the empty
  # regexp, which matches every string and would therefore disable link
  # info for every host.
  def reset_no_info_hosts
    patterns = Array(@bot.config['url.no_info_hosts']).map { |p| p.to_s }.reject { |p| p.empty? }
    @no_info_hosts =
      if patterns.empty?
        Regexp.union # no alternatives: /(?!)/, which can never match
      else
        Regexp.new(patterns.join('|'), Regexp::IGNORECASE)
      end
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  67
  # Returns the usage text for the plugin's commands. Both arguments are
  # accepted but not used: the same text is returned for every topic.
  def help(plugin, topic="")
    [
      "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url)",
      "urls [<max>=4] => list <max> last urls mentioned in current channel",
      "urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max]",
      "urls search <channel> [max] <regexp>"
    ].join(", ")
  end
  71
  # Returns whatever +pagedata+ reports as its ircify_html_title.
  def get_title_from_html(pagedata)
    pagedata.ircify_html_title
  end
  75
  # Builds the one-line description (title, first paragraph, content type,
  # size ...) shown for a link.
  #
  # uri_str:: the link, either a String or an already parsed URI
  # opts::    options Hash. Only :always_reply is read here; a copy of the
  #           whole hash, extended with :htmlinfo, :title and :extra, is
  #           passed along with the :url_added event.
  #
  # Returns
  # * nil when the scheme is not http/https, or when no title was found and
  #   'url.titles_only' is set;
  # * false (or a "Sorry, ..." String when opts[:always_reply] is set) when
  #   the host or one of its addresses matches @no_info_hosts;
  # * an "Unable to retrieve info ..." String when the address lookup fails;
  # * otherwise the collected pieces of information joined with ", ".
  #
  # Raises UrlLinkError unchanged. Any other StandardError raised while
  # fetching or processing the page is logged and re-raised as a
  # RuntimeError. Non-StandardError exceptions (Interrupt, NoMemoryError,
  # ...) are deliberately no longer intercepted.
  def get_title_for_url(uri_str, opts = {})
    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # Anchored on purpose: an unanchored /https?/ would also accept any
    # scheme that merely contains "http".
    return unless url.scheme =~ /\Ahttps?\z/i

    # Check the host name and every address it resolves to against the
    # list of hosts for which no info must be fetched.
    begin
      checks = Addrinfo.getaddrinfo(url.host, nil).map { |addr| addr.ip_address }
    rescue => e
      return "Unable to retrieve info for #{url.host}: #{e.message}"
    end
    checks << url.host

    unless checks.grep(@no_info_hosts).empty?
      return (opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false)
    end

    logopts = opts.dup
    title = nil
    extra = []

    begin
      debug "+ getting info for #{url.request_uri}"
      info = @bot.filter(:htmlinfo, url)
      logopts[:htmlinfo] = info
      resp = info[:headers]

      logopts[:title] = title = info[:title]

      if info[:content]
        logopts[:extra] = info[:content]

        max_length = @bot.config['url.first_par_length']
        whitelist = @bot.config['url.first_par_whitelist']
        target = url.to_s
        # An empty whitelist allows every URL; otherwise at least one
        # pattern has to match (case-insensitively).
        allowed = whitelist.empty? ||
                  whitelist.any? { |pattern| Regexp.new(pattern, Regexp::IGNORECASE).match(target) }
        content = allowed ? info[:content][0...max_length] : nil

        extra << "#{Bold}text#{Bold}: #{content}" if @bot.config['url.first_par'] && content
      else
        # No text content: describe the resource through its headers.
        show_details = @bot.config['url.first_par'] || !title

        logopts[:extra] = String.new
        logopts[:extra] << "Content Type: #{resp['content-type']}"
        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
        if enc = resp['content-encoding']
          logopts[:extra] << ", encoding: #{enc}"
          extra << "#{Bold}encoding#{Bold}: #{enc}" if show_details
        end

        # Best effort: a missing or oddly shaped content-length header
        # simply means that no size is reported.
        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
        if size
          logopts[:extra] << ", size: #{size} bytes"
          extra << "#{Bold}size#{Bold}: #{size} bytes" if show_details
        end
      end
    rescue UrlLinkError
      raise
    rescue StandardError => e
      error e
      raise "connecting to site/processing information (#{e.message})"
    end

    call_event(:url_added, url.to_s, logopts)
    extra.unshift("#{Bold}title#{Bold}: #{title}") if title
    extra.join(", ") if title || !@bot.config['url.titles_only']
  end
 158
 159   def handle_urls(m, params={})
 160     opts = {
 161       :display_info => @bot.config['url.display_link_info'],
 162       :channels => @bot.config['url.only_on_channels'],
 163       :ignore => @bot.config['url.ignore']
 164     }.merge params
 165     urls = opts[:urls]
 166     display_info= opts[:display_info]
 167     channels = opts[:channels]
 168     ignore = opts[:ignore]
 169
 170     unless channels.empty?
 171       return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
 172     end
 173
 174     ignore.each { |u| return if m.source.matches?(u) }
 175
 176     return if urls.empty?
 177     debug "found urls #{urls.inspect}"
 178     list = m.public? ? @registry[m.target] : nil
 179     debug "display link info: #{display_info}"
 180     urls_displayed = 0
 181     urls.each do |urlstr|
 182       debug "working on #{urlstr}"
 183       next unless urlstr =~ /^https?:\/\/./
 184       if @bot.config['url.auto_shorten'] == true and
 185          urlstr.length >= @bot.config['url.auto_shorten_min_length']
 186         m.reply(bot.plugins['shortenurls'].shorten(nil, {:url=>urlstr, :called=>true}))
 187         next
 188       end
 189       title = nil
 190       debug "Getting title for #{urlstr}..."
 191       reply = nil
 192       begin
 193         title = get_title_for_url(urlstr,
 194                                   :always_reply => m.address?,
 195                                   :nick => m.source.nick,
 196                                   :channel => m.channel,
 197                                   :ircline => m.message)
 198         debug "Title #{title ? '' : 'not '} found"
 199         reply = "#{LINK_INFO} #{title}" if title
 200       rescue => e
 201         debug e
 202         # we might get a 404 because of trailing punctuation, so we try again
 203         # with the last character stripped. this might generate invalid URIs
 204         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 205         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 206           # chop off last non-word character from the unescaped version of
 207           # the URL, and retry if we still have enough string to look like a
 208           # minimal URL
 209           unescaped = URI.unescape(urlstr)
 210           debug "Unescaped: #{unescaped}"
 211           if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
 212             urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
 213             retry
 214           else
 215             debug "Not retrying #{unescaped}"
 216           end
 217         end
 218         reply = "Error #{e.message}"
 219       end
 220
 221       if display_info > urls_displayed
 222         if reply
 223           m.reply reply, :overlong => :truncate, :to => :public,
 224             :nick => (m.address? ? :auto : false)
 225           urls_displayed += 1
 226         end
 227       end
 228
 229       next unless list
 230
 231       # check to see if this url is already listed
 232       next if list.find {|u| u.url == urlstr }
 233
 234       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 235       debug "#{list.length} urls so far"
 236       list.pop if list.length > @bot.config['url.max_urls']
 237       debug "storing url #{url.url}"
 238       list.unshift url
 239       debug "#{list.length} urls now"
 240     end
 241     @registry[m.target] = list
 242   end
 243
 244   def info(m, params)
 245     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 246     urls = URI.extract(escaped)
 247     Thread.new do
 248       handle_urls(m,
 249                   :urls => urls,
 250                   :display_info => params[:urls].length,
 251                   :channels => [])
 252     end
 253   end
 254
 255   def message(m)
 256     return if m.address?
 257
 258     urls = URI.extract(m.message, ['http', 'https'])
 259     return if urls.empty?
 260     Thread.new { handle_urls(m, :urls => urls) }
 261   end
 262
 263   def reply_urls(opts={})
 264     list = opts[:list]
 265     max = opts[:max]
 266     channel = opts[:channel]
 267     m = opts[:msg]
 268     return unless list and max and m
 269     list[0..(max-1)].each do |url|
 270       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 271       if @bot.config['url.info_on_list']
 272         title = url.info ||
 273           get_title_for_url(url.url,
 274                             :nick => url.nick, :channel => channel) rescue nil
 275         # If the url info was missing and we now have some, try to upgrade it
 276         if channel and title and not url.info
 277           ll = @registry[channel]
 278           debug ll
 279           if el = ll.find { |u| u.url == url.url }
 280             el.info = title
 281             @registry[channel] = ll
 282           end
 283         end
 284         disp << " --> #{title}" if title
 285       end
 286       m.reply disp, :overlong => :truncate
 287     end
 288   end
 289
 290   def urls(m, params)
 291     channel = params[:channel] ? params[:channel] : m.target
 292     max = params[:limit].to_i
 293     max = 10 if max > 10
 294     max = 1 if max < 1
 295     list = @registry[channel]
 296     if list.empty?
 297       m.reply "no urls seen yet for channel #{channel}"
 298     else
 299       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 300     end
 301   end
 302
 303   def search(m, params)
 304     channel = params[:channel] ? params[:channel] : m.target
 305     max = params[:limit].to_i
 306     string = params[:string]
 307     max = 10 if max > 10
 308     max = 1 if max < 1
 309     regex = Regexp.new(string, Regexp::IGNORECASE)
 310     list = @registry[channel].find_all {|url|
 311       regex.match(url.url) || regex.match(url.nick) ||
 312         (@bot.config['url.info_on_list'] && regex.match(url.info))
 313     }
 314     if list.empty?
 315       m.reply "no matches for channel #{channel}"
 316     else
 317       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 318     end
 319   end
 320 end
 321
 322 plugin = UrlPlugin.new
 323 plugin.map 'urls info *urls', :action => 'info'
 324 plugin.map 'url info *urls', :action => 'info'
 325 plugin.map 'urls search :channel :limit :string', :action => 'search',
 326                           :defaults => {:limit => 4},
 327                           :requirements => {:limit => /^\d+$/},
 328                           :public => false
 329 plugin.map 'urls search :limit :string', :action => 'search',
 330                           :defaults => {:limit => 4},
 331                           :requirements => {:limit => /^\d+$/},
 332                           :private => false
 333 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 334                           :requirements => {:limit => /^\d+$/},
 335                           :public => false
 336 plugin.map 'urls :limit', :defaults => {:limit => 4},
 337                           :requirements => {:limit => /^\d+$/},
 338                           :private => false