data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
# Record type for one remembered URL. Instances are built in
# UrlPlugin#handle_urls with: the message target the URL was seen on, the
# nick that posted it, the time it was stored, the URL string, and the link
# info obtained for it (may be nil/false when none was retrieved).
define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class UrlPlugin < Plugin
  # Prefix placed in front of link information replies.
  LINK_INFO = "[Link Info]"
  # Character class handed to URI.escape by this plugin as the set of
  # characters to percent-escape: everything that is not matched by
  # URI::PATTERN::UNRESERVED or URI::PATTERN::RESERVED and is not '%', '#'
  # or a space.
  OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  11
  # Configuration keys of the url plugin. Each call registers one value;
  # the :desc string is the user-visible description of that key.

  # --- storage -----------------------------------------------------------
  Config.register(
    Config::IntegerValue.new(
      'url.max_urls',
      :default => 100,
      :validate => Proc.new { |v| v > 0 },
      :desc => "Maximum number of urls to store. New urls replace oldest ones."
    )
  )

  # --- automatic link info -----------------------------------------------
  Config.register(
    Config::IntegerValue.new(
      'url.display_link_info',
      :default => 0,
      :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)"
    )
  )
  Config.register(
    Config::BooleanValue.new(
      'url.auto_shorten',
      :default => false,
      :desc => "Automatically spit out shortened URLs when they're seen. Check shortenurls for config options"
    )
  )
  Config.register(
    Config::IntegerValue.new(
      'url.auto_shorten_min_length',
      :default => 48,
      :desc => "Minimum length of URL to auto-shorten.  Only has an effect when url.auto_shorten is true."
    )
  )
  Config.register(
    Config::BooleanValue.new(
      'url.titles_only',
      :default => false,
      :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)"
    )
  )

  # --- first paragraph extraction ----------------------------------------
  Config.register(
    Config::BooleanValue.new(
      'url.first_par',
      :default => false,
      :desc => "Also try to get the first paragraph of a web page"
    )
  )
  Config.register(
    Config::IntegerValue.new(
      'url.first_par_length',
      :default => 150,
      :desc => "The max length of the first paragraph"
    )
  )
  Config.register(
    Config::ArrayValue.new(
      'url.first_par_whitelist',
      :default => ['twitter.com'],
      :desc => "List of url patterns to show the content for."
    )
  )

  # --- listing / filtering -----------------------------------------------
  Config.register(
    Config::BooleanValue.new(
      'url.info_on_list',
      :default => false,
      :desc => "Show link info when listing/searching for urls"
    )
  )
  Config.register(
    Config::ArrayValue.new(
      'url.no_info_hosts',
      :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
      # Rebuild the cached host regexp whenever the list is changed.
      :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
      :desc => "A list of regular expressions matching hosts for which no info should be provided"
    )
  )
  Config.register(
    Config::ArrayValue.new(
      'url.only_on_channels',
      :default => [],
      :desc => "Show link info only on these channels"
    )
  )
  Config.register(
    Config::ArrayValue.new(
      'url.ignore',
      :default => [],
      :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example."
    )
  )
  49
  # Prepares the plugin: gives the registry an empty Array as default value,
  # normalises the stored url.display_link_info setting, builds the
  # no-info-hosts regexp and loads the :htmlinfo filter group.
  def initialize
    super
    @registry.set_default([])

    # If the configured value is not an Integer, feed its string form back
    # through the config item's set_string.
    link_info_limit = @bot.config['url.display_link_info']
    unless link_info_limit.kind_of?(Integer)
      @bot.config.items[:'url.display_link_info'].set_string(link_info_limit.to_s)
    end

    reset_no_info_hosts

    self.filter_group = :htmlinfo
    load_filters
  end
  60
  # Rebuilds @no_info_hosts, the case-insensitive regexp matching every host
  # for which no link info must be fetched, from the url.no_info_hosts
  # configuration list. Called at start-up and whenever that list changes.
  #
  # An empty list (or one containing only blank entries) yields a regexp
  # that never matches, so that no host is blocked.
  def reset_no_info_hosts
    patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
    @no_info_hosts =
      if patterns.empty?
        # Joining nothing would compile to //, which matches *every* host and
        # would silently disable link info everywhere. (?!) can never match.
        /(?!)/
      else
        Regexp.new(patterns.join('|'), Regexp::IGNORECASE)
      end
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  65
  # Returns the help text of the plugin. Both arguments are ignored: the
  # same text is returned for every topic.
  def help(plugin, topic="")
    usage = [
      "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url)",
      "urls [<max>=4] => list <max> last urls mentioned in current channel",
      "urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
    ]
    usage.join(", ")
  end
  69
  # Returns whatever +pagedata+ answers to #ircify_html_title.
  def get_title_from_html(pagedata)
    pagedata.ircify_html_title
  end
  73
  # Builds the link-info text for a URL.
  #
  # uri_str:: a URI object, or a string that URI.parse accepts
  # opts::    option hash; :always_reply is read here, and a copy of the
  #           whole hash (extended with :htmlinfo, :title and :extra) is
  #           passed along with the :url_added event
  #
  # Returns
  # * nil when the URL scheme is not exactly http or https,
  # * an "Unable to retrieve info ..." string when the host lookup fails,
  # * a "Sorry, ..." string (with :always_reply) or false (without) when the
  #   host matches @no_info_hosts,
  # * the comma-separated info string otherwise, or nil when no title was
  #   found and url.titles_only is set.
  #
  # Raises UrlLinkError unchanged, and a RuntimeError wrapping the message
  # of any other exception raised while fetching/processing the page.
  def get_title_for_url(uri_str, opts = {})

    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # Anchored on both ends: an unanchored /https?/ would also accept
    # schemes that merely contain "http", such as "shttp" or "httpx".
    return if url.scheme !~ /\Ahttps?\z/

    # also check the ip, the canonical name and the aliases
    begin
      # gethostbyname returns [canonical name, aliases, address family,
      # *addresses]. Drop the address family by its fixed index: counting
      # from the end (-2) removes an address instead whenever the host
      # resolves to more than one, letting that address skip the check below.
      checks = TCPSocket.gethostbyname(url.host)
      checks.delete_at(2)
    rescue => e
      return "Unable to retrieve info for #{url.host}: #{e.message}"
    end

    checks << url.host
    checks.flatten!

    unless checks.grep(@no_info_hosts).empty?
      return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
    end

    # Copy of the caller's options, enriched below and sent with :url_added.
    logopts = opts.dup

    title = nil
    extra = []

    begin
      debug "+ getting info for #{url.request_uri}"
      info = @bot.filter(:htmlinfo, url)
      logopts[:htmlinfo] = info
      resp = info[:headers]

      logopts[:title] = title = info[:title]

      if info[:content]
        logopts[:extra] = info[:content]

        max_length = @bot.config['url.first_par_length']

        # Content is shown when the whitelist is empty, or when at least one
        # whitelist pattern matches the URL (case-insensitively).
        whitelist = @bot.config['url.first_par_whitelist']
        whitelisted = whitelist.empty? || whitelist.any? { |pattern|
          Regexp.new(pattern, Regexp::IGNORECASE).match(url.to_s)
        }
        content = whitelisted ? info[:content][0...max_length] : nil

        extra << "#{Bold}text#{Bold}: #{content}" if @bot.config['url.first_par'] and content
      else
        # No textual content: report content type, encoding and size instead.
        logopts[:extra] = String.new
        logopts[:extra] << "Content Type: #{resp['content-type']}"
        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
        if enc = resp['content-encoding']
          logopts[:extra] << ", encoding: #{enc}"
          extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
        end

        # Best effort: insert thousands separators into the content length;
        # any failure (e.g. missing header) simply means no size is shown.
        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
        if size
          logopts[:extra] << ", size: #{size} bytes"
          extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
        end
      end
    rescue Exception => e
      # NOTE(review): Exception (not StandardError) is rescued on purpose in
      # the original code, presumably to also catch timeouts on old Rubies;
      # kept as-is, confirm before narrowing.
      case e
      when UrlLinkError
        raise e
      else
        error e
        raise "connecting to site/processing information (#{e.message})"
      end
    end

    call_event(:url_added, url.to_s, logopts)
    if title
      extra.unshift("#{Bold}title#{Bold}: #{title}")
    end
    return extra.join(", ") if title or not @bot.config['url.titles_only']
  end
 157
 158   def handle_urls(m, params={})
 159     opts = {
 160       :display_info => @bot.config['url.display_link_info'],
 161       :channels => @bot.config['url.only_on_channels'],
 162       :ignore => @bot.config['url.ignore']
 163     }.merge params
 164     urls = opts[:urls]
 165     display_info= opts[:display_info]
 166     channels = opts[:channels]
 167     ignore = opts[:ignore]
 168
 169     unless channels.empty?
 170       return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
 171     end
 172
 173     ignore.each { |u| return if m.source.matches?(u) }
 174
 175     return if urls.empty?
 176     debug "found urls #{urls.inspect}"
 177     list = m.public? ? @registry[m.target] : nil
 178     debug "display link info: #{display_info}"
 179     urls_displayed = 0
 180     urls.each do |urlstr|
 181       debug "working on #{urlstr}"
 182       next unless urlstr =~ /^https?:\/\/./
 183       if @bot.config['url.auto_shorten'] == true and
 184          urlstr.length >= @bot.config['url.auto_shorten_min_length']
 185         m.reply(bot.plugins['shortenurls'].shorten(nil, {:url=>urlstr, :called=>true}))
 186         next
 187       end
 188       title = nil
 189       debug "Getting title for #{urlstr}..."
 190       reply = nil
 191       begin
 192         title = get_title_for_url(urlstr,
 193                                   :always_reply => m.address?,
 194                                   :nick => m.source.nick,
 195                                   :channel => m.channel,
 196                                   :ircline => m.message)
 197         debug "Title #{title ? '' : 'not '} found"
 198         reply = "#{LINK_INFO} #{title}" if title
 199       rescue => e
 200         debug e
 201         # we might get a 404 because of trailing punctuation, so we try again
 202         # with the last character stripped. this might generate invalid URIs
 203         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 204         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 205           # chop off last non-word character from the unescaped version of
 206           # the URL, and retry if we still have enough string to look like a
 207           # minimal URL
 208           unescaped = URI.unescape(urlstr)
 209           debug "Unescaped: #{unescaped}"
 210           if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
 211             urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
 212             retry
 213           else
 214             debug "Not retrying #{unescaped}"
 215           end
 216         end
 217         reply = "Error #{e.message}"
 218       end
 219
 220       if display_info > urls_displayed
 221         if reply
 222           m.reply reply, :overlong => :truncate, :to => :public,
 223             :nick => (m.address? ? :auto : false)
 224           urls_displayed += 1
 225         end
 226       end
 227
 228       next unless list
 229
 230       # check to see if this url is already listed
 231       next if list.find {|u| u.url == urlstr }
 232
 233       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 234       debug "#{list.length} urls so far"
 235       list.pop if list.length > @bot.config['url.max_urls']
 236       debug "storing url #{url.url}"
 237       list.unshift url
 238       debug "#{list.length} urls now"
 239     end
 240     @registry[m.target] = list
 241   end
 242
 243   def info(m, params)
 244     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 245     urls = URI.extract(escaped)
 246     Thread.new do
 247       handle_urls(m,
 248                   :urls => urls,
 249                   :display_info => params[:urls].length,
 250                   :channels => [])
 251     end
 252   end
 253
 254   def message(m)
 255     return if m.address?
 256
 257     urls = URI.extract(m.message, ['http', 'https'])
 258     return if urls.empty?
 259     Thread.new { handle_urls(m, :urls => urls) }
 260   end
 261
 262   def reply_urls(opts={})
 263     list = opts[:list]
 264     max = opts[:max]
 265     channel = opts[:channel]
 266     m = opts[:msg]
 267     return unless list and max and m
 268     list[0..(max-1)].each do |url|
 269       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 270       if @bot.config['url.info_on_list']
 271         title = url.info ||
 272           get_title_for_url(url.url,
 273                             :nick => url.nick, :channel => channel) rescue nil
 274         # If the url info was missing and we now have some, try to upgrade it
 275         if channel and title and not url.info
 276           ll = @registry[channel]
 277           debug ll
 278           if el = ll.find { |u| u.url == url.url }
 279             el.info = title
 280             @registry[channel] = ll
 281           end
 282         end
 283         disp << " --> #{title}" if title
 284       end
 285       m.reply disp, :overlong => :truncate
 286     end
 287   end
 288
 289   def urls(m, params)
 290     channel = params[:channel] ? params[:channel] : m.target
 291     max = params[:limit].to_i
 292     max = 10 if max > 10
 293     max = 1 if max < 1
 294     list = @registry[channel]
 295     if list.empty?
 296       m.reply "no urls seen yet for channel #{channel}"
 297     else
 298       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 299     end
 300   end
 301
 302   def search(m, params)
 303     channel = params[:channel] ? params[:channel] : m.target
 304     max = params[:limit].to_i
 305     string = params[:string]
 306     max = 10 if max > 10
 307     max = 1 if max < 1
 308     regex = Regexp.new(string, Regexp::IGNORECASE)
 309     list = @registry[channel].find_all {|url|
 310       regex.match(url.url) || regex.match(url.nick) ||
 311         (@bot.config['url.info_on_list'] && regex.match(url.info))
 312     }
 313     if list.empty?
 314       m.reply "no matches for channel #{channel}"
 315     else
 316       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 317     end
 318   end
 319 end
 320
# Instantiate the plugin and register its command templates.
# NOTE(review): the order of the maps is kept exactly as-is; presumably
# templates are tried in registration order — confirm before reordering.
plugin = UrlPlugin.new

# "url(s) info <urls...>" => UrlPlugin#info
plugin.map 'urls info *urls', :action => 'info'
plugin.map 'url info *urls', :action => 'info'

# "urls search ..." => UrlPlugin#search. The variant taking an explicit
# channel is disabled in public (:public => false), the one without it is
# disabled in private (:private => false). :limit must be numeric and
# defaults to 4.
plugin.map 'urls search :channel :limit :string', :action => 'search',
                          :defaults => {:limit => 4},
                          :requirements => {:limit => /^\d+$/},
                          :public => false
plugin.map 'urls search :limit :string', :action => 'search',
                          :defaults => {:limit => 4},
                          :requirements => {:limit => /^\d+$/},
                          :private => false

# "urls ..." with the same public/private split and :limit handling. No
# :action is given here; presumably the handler is then derived from the
# template (UrlPlugin#urls) — verify against the map implementation.
plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
                          :requirements => {:limit => /^\d+$/},
                          :public => false
plugin.map 'urls :limit', :defaults => {:limit => 4},
                          :requirements => {:limit => /^\d+$/},
                          :private => false