data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class UrlPlugin < Plugin
   9   LINK_INFO = "[Link Info]"
  10   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  11
  12   Config.register Config::IntegerValue.new('url.max_urls',
  13     :default => 100, :validate => Proc.new{|v| v > 0},
  14     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  15   Config.register Config::IntegerValue.new('url.display_link_info',
  16     :default => 0,
  17     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  18   Config.register Config::BooleanValue.new('url.auto_shorten',
  19     :default => false,
  20     :desc => "Automatically spit out shortened URLs when they're seen. Check shortenurls for config options")
  21   Config.register Config::IntegerValue.new('url.auto_shorten_min_length',
  22     :default => 48,
  23     :desc => "Minimum length of URL to auto-shorten.  Only has an effect when url.auto_shorten is true.")
  24   Config.register Config::BooleanValue.new('url.titles_only',
  25     :default => false,
  26     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  27   Config.register Config::BooleanValue.new('url.first_par',
  28     :default => false,
  29     :desc => "Also try to get the first paragraph of a web page")
  30   Config.register Config::BooleanValue.new('url.info_on_list',
  31     :default => false,
  32     :desc => "Show link info when listing/searching for urls")
  33   Config.register Config::ArrayValue.new('url.no_info_hosts',
  34     :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
  35     :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
  36     :desc => "A list of regular expressions matching hosts for which no info should be provided")
  37   Config.register Config::ArrayValue.new('url.only_on_channels',
  38     :desc => "Show link info only on these channels",
  39     :default => [])
  40   Config.register Config::ArrayValue.new('url.ignore',
  41     :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
  42     :default => [])
  43
  44   def initialize
  45     super
  46     @registry.set_default(Array.new)
  47     unless @bot.config['url.display_link_info'].kind_of?(Integer)
  48       @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
  49     end
  50     reset_no_info_hosts
  51     self.filter_group = :htmlinfo
  52     load_filters
  53   end
  54
  55   def reset_no_info_hosts
  56     @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
  57     debug "no info hosts regexp set to #{@no_info_hosts}"
  58   end
  59
  60   def help(plugin, topic="")
  61     "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  62   end
  63
  64   def get_title_from_html(pagedata)
  65     return pagedata.ircify_html_title
  66   end
  67
  68   def get_title_for_url(uri_str, opts = {})
  69
  70     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
  71     return if url.scheme !~ /https?/
  72
  73     # also check the ip, the canonical name and the aliases
  74     begin
  75       checks = TCPSocket.gethostbyname(url.host)
  76       checks.delete_at(-2)
  77     rescue => e
  78       return "Unable to retrieve info for #{url.host}: #{e.message}"
  79     end
  80
  81     checks << url.host
  82     checks.flatten!
  83
  84     unless checks.grep(@no_info_hosts).empty?
  85       return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
  86     end
  87
  88     logopts = opts.dup
  89
  90     title = nil
  91     extra = []
  92
  93     begin
  94       debug "+ getting info for #{url.request_uri}"
  95       info = @bot.filter(:htmlinfo, url)
  96       debug info
  97       logopts[:htmlinfo] = info
  98       resp = info[:headers]
  99
 100       logopts[:title] = title = info[:title]
 101
 102       if info[:content]
 103         logopts[:extra] = info[:content]
 104         extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
 105       else
 106         logopts[:extra] = String.new
 107         logopts[:extra] << "Content Type: #{resp['content-type']}"
 108         extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
 109         if enc = resp['content-encoding']
 110           logopts[:extra] << ", encoding: #{enc}"
 111           extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
 112         end
 113
 114         size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
 115         if size
 116           logopts[:extra] << ", size: #{size} bytes"
 117           extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
 118         end
 119       end
 120     rescue Exception => e
 121       case e
 122       when UrlLinkError
 123         raise e
 124       else
 125         error e
 126         raise "connecting to site/processing information (#{e.message})"
 127       end
 128     end
 129
 130     call_event(:url_added, url.to_s, logopts)
 131     if title
 132       extra.unshift("#{Bold}title#{Bold}: #{title}")
 133     end
 134     return extra.join(", ") if title or not @bot.config['url.titles_only']
 135   end
 136
 137   def handle_urls(m, params={})
 138     opts = {
 139       :display_info => @bot.config['url.display_link_info'],
 140       :channels => @bot.config['url.only_on_channels'],
 141       :ignore => @bot.config['url.ignore']
 142     }.merge params
 143     urls = opts[:urls]
 144     display_info= opts[:display_info]
 145     channels = opts[:channels]
 146     ignore = opts[:ignore]
 147
 148     unless channels.empty?
 149       return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
 150     end
 151
 152     ignore.each { |u| return if m.source.matches?(u) }
 153
 154     return if urls.empty?
 155     debug "found urls #{urls.inspect}"
 156     list = m.public? ? @registry[m.target] : nil
 157     debug "display link info: #{display_info}"
 158     urls_displayed = 0
 159     urls.each do |urlstr|
 160       debug "working on #{urlstr}"
 161       next unless urlstr =~ /^https?:\/\/./
 162       if @bot.config['url.auto_shorten'] == true and
 163          urlstr.length >= @bot.config['url.auto_shorten_min_length']
 164         m.reply(bot.plugins['shortenurls'].shorten(nil, {:url=>urlstr, :called=>true}))
 165         next
 166       end
 167       title = nil
 168       debug "Getting title for #{urlstr}..."
 169       reply = nil
 170       begin
 171         title = get_title_for_url(urlstr,
 172                                   :always_reply => m.address?,
 173                                   :nick => m.source.nick,
 174                                   :channel => m.channel,
 175                                   :ircline => m.message)
 176         debug "Title #{title ? '' : 'not '} found"
 177         reply = "#{LINK_INFO} #{title}" if title
 178       rescue => e
 179         debug e
 180         # we might get a 404 because of trailing punctuation, so we try again
 181         # with the last character stripped. this might generate invalid URIs
 182         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 183         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 184           # chop off last non-word character from the unescaped version of
 185           # the URL, and retry if we still have enough string to look like a
 186           # minimal URL
 187           unescaped = URI.unescape(urlstr)
 188           debug "Unescaped: #{unescaped}"
 189           if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
 190             urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
 191             retry
 192           else
 193             debug "Not retrying #{unescaped}"
 194           end
 195         end
 196         reply = "Error #{e.message}"
 197       end
 198
 199       if display_info > urls_displayed
 200         if reply
 201           m.reply reply, :overlong => :truncate, :to => :public,
 202             :nick => (m.address? ? :auto : false)
 203           urls_displayed += 1
 204         end
 205       end
 206
 207       next unless list
 208
 209       # check to see if this url is already listed
 210       next if list.find {|u| u.url == urlstr }
 211
 212       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 213       debug "#{list.length} urls so far"
 214       list.pop if list.length > @bot.config['url.max_urls']
 215       debug "storing url #{url.url}"
 216       list.unshift url
 217       debug "#{list.length} urls now"
 218     end
 219     @registry[m.target] = list
 220   end
 221
 222   def info(m, params)
 223     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 224     urls = URI.extract(escaped)
 225     Thread.new do
 226       handle_urls(m,
 227                   :urls => urls,
 228                   :display_info => params[:urls].length,
 229                   :channels => [])
 230     end
 231   end
 232
 233   def message(m)
 234     return if m.address?
 235
 236     escaped = URI.escape(m.message, OUR_UNSAFE)
 237     urls = URI.extract(escaped, ['http', 'https'])
 238     return if urls.empty?
 239     Thread.new { handle_urls(m, :urls => urls) }
 240   end
 241
 242   def reply_urls(opts={})
 243     list = opts[:list]
 244     max = opts[:max]
 245     channel = opts[:channel]
 246     m = opts[:msg]
 247     return unless list and max and m
 248     list[0..(max-1)].each do |url|
 249       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 250       if @bot.config['url.info_on_list']
 251         title = url.info ||
 252           get_title_for_url(url.url,
 253                             :nick => url.nick, :channel => channel) rescue nil
 254         # If the url info was missing and we now have some, try to upgrade it
 255         if channel and title and not url.info
 256           ll = @registry[channel]
 257           debug ll
 258           if el = ll.find { |u| u.url == url.url }
 259             el.info = title
 260             @registry[channel] = ll
 261           end
 262         end
 263         disp << " --> #{title}" if title
 264       end
 265       m.reply disp, :overlong => :truncate
 266     end
 267   end
 268
 269   def urls(m, params)
 270     channel = params[:channel] ? params[:channel] : m.target
 271     max = params[:limit].to_i
 272     max = 10 if max > 10
 273     max = 1 if max < 1
 274     list = @registry[channel]
 275     if list.empty?
 276       m.reply "no urls seen yet for channel #{channel}"
 277     else
 278       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 279     end
 280   end
 281
 282   def search(m, params)
 283     channel = params[:channel] ? params[:channel] : m.target
 284     max = params[:limit].to_i
 285     string = params[:string]
 286     max = 10 if max > 10
 287     max = 1 if max < 1
 288     regex = Regexp.new(string, Regexp::IGNORECASE)
 289     list = @registry[channel].find_all {|url|
 290       regex.match(url.url) || regex.match(url.nick) ||
 291         (@bot.config['url.info_on_list'] && regex.match(url.info))
 292     }
 293     if list.empty?
 294       m.reply "no matches for channel #{channel}"
 295     else
 296       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 297     end
 298   end
 299 end
 300
 301 plugin = UrlPlugin.new
 302 plugin.map 'urls info *urls', :action => 'info'
 303 plugin.map 'url info *urls', :action => 'info'
 304 plugin.map 'urls search :channel :limit :string', :action => 'search',
 305                           :defaults => {:limit => 4},
 306                           :requirements => {:limit => /^\d+$/},
 307                           :public => false
 308 plugin.map 'urls search :limit :string', :action => 'search',
 309                           :defaults => {:limit => 4},
 310                           :requirements => {:limit => /^\d+$/},
 311                           :private => false
 312 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 313                           :requirements => {:limit => /^\d+$/},
 314                           :public => false
 315 plugin.map 'urls :limit', :defaults => {:limit => 4},
 316                           :requirements => {:limit => /^\d+$/},
 317                           :private => false