data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class UrlPlugin < Plugin
   9   LINK_INFO = "[Link Info]"
  10   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  11
  12   Config.register Config::IntegerValue.new('url.max_urls',
  13     :default => 100, :validate => Proc.new{|v| v > 0},
  14     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  15   Config.register Config::IntegerValue.new('url.display_link_info',
  16     :default => 0,
  17     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  18   Config.register Config::BooleanValue.new('url.titles_only',
  19     :default => false,
  20     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  21   Config.register Config::BooleanValue.new('url.first_par',
  22     :default => false,
  23     :desc => "Also try to get the first paragraph of a web page")
  24   Config.register Config::BooleanValue.new('url.info_on_list',
  25     :default => false,
  26     :desc => "Show link info when listing/searching for urls")
  27   Config.register Config::ArrayValue.new('url.no_info_hosts',
  28     :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
  29     :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
  30     :desc => "A list of regular expressions matching hosts for which no info should be provided")
  31
  32
  # Set up the plugin: channels without stored urls default to an empty
  # list, a non-Integer 'url.display_link_info' value is re-entered through
  # the config item's string setter, and the no-info-hosts regexp is built.
  def initialize
    super
    @registry.set_default([])

    link_info = @bot.config['url.display_link_info']
    # presumably a leftover from an older, non-numeric form of this
    # setting -- verify against the config history
    if !link_info.kind_of?(Integer)
      @bot.config.items[:'url.display_link_info'].set_string(link_info.to_s)
    end

    reset_no_info_hosts
  end
  41
  # Rebuild @no_info_hosts, the case-insensitive regexp that matches the
  # hosts for which no link info must be retrieved, from the
  # 'url.no_info_hosts' configuration list.
  #
  # Blank entries are skipped, and an empty list yields a regexp that never
  # matches: joining nothing (or an empty alternative) with '|' would
  # produce a pattern that matches every host and thereby disable link info
  # altogether.
  def reset_no_info_hosts
    patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
    source = patterns.empty? ? '(?!)' : patterns.join('|')
    @no_info_hosts = Regexp.new(source, true)
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  46
  # Usage text for the plugin; the same text is returned whatever +plugin+
  # and +topic+ are.
  def help(plugin, topic="")
    "url info <url> => display link info for <url> " \
    "(set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), " \
    "urls [<max>=4] => list <max> last urls mentioned in current channel, " \
    "urls search [<max>=4] <regexp> => search for matching urls. " \
    "In a private message, you must specify the channel to query, " \
    "eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  end
  50
  # Return the title of +pagedata+ as produced by its #ircify_html_title
  # method.
  def get_title_from_html(pagedata)
    pagedata.ircify_html_title
  end
  54
  # Build the link info string for an url.
  #
  # uri_str:: the url, either as a String or as an URI object
  # opts::    extra data about the url; it is copied, extended with :title
  #           and :extra, and passed along with the :url_added event
  #
  # Returns nil when the scheme is not http or https, or when no title was
  # found and 'url.titles_only' is set. Returns an explanatory message when
  # the host cannot be resolved or when the host, its canonical name, one of
  # its aliases or one of its addresses matches 'url.no_info_hosts'.
  # Otherwise returns the comma-separated pieces of information gathered
  # (title first, when there is one).
  #
  # An UrlLinkError raised while retrieving the info is passed on untouched;
  # anything else is logged and re-raised as a RuntimeError whose message
  # starts with "connecting to site/processing information".
  def get_title_for_url(uri_str, opts = {})

    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # anchored, so that only exactly "http" and "https" are accepted
    return if url.scheme !~ /\Ahttps?\z/

    # also check the ip, the canonical name and the aliases
    begin
      checks = TCPSocket.gethostbyname(url.host)
      # The result is [canonical name, aliases, address family, address...]:
      # drop the numeric address family, which always sits at index 2.
      # Counting from the end would discard a real address (and skip its
      # check) whenever the host resolves to more than one address.
      checks.delete_at(2)
    rescue => e
      return "Unable to retrieve info for #{url.host}: #{e.message}"
    end

    checks << url.host
    checks.flatten!

    unless checks.grep(@no_info_hosts).empty?
      return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
    end

    # work on a copy so that the caller's hash is left alone
    logopts = opts.dup

    title = nil
    extra = []

    begin
      debug "+ getting info for #{url.request_uri}"
      info = @bot.filter(:htmlinfo, url)
      debug info
      resp = info[:headers]

      logopts[:title] = title = info[:title]

      if info[:content]
        logopts[:extra] = info[:content]
        extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
      else
        # no text content: fall back to what the response headers tell us
        logopts[:extra] = String.new
        logopts[:extra] << "Content Type: #{resp['content-type']}"
        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
        if enc = resp['content-encoding']
          logopts[:extra] << ", encoding: #{enc}"
          extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
        end

        # group the digits of the size in threes; a missing or unusable
        # content-length header simply leaves the size out
        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
        if size
          logopts[:extra] << ", size: #{size} bytes"
          extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
        end
      end
    rescue Exception => e
      # NOTE(review): the rescue is kept as broad as it was; presumably this
      # is meant to catch every failure of the retrieval -- confirm before
      # narrowing it to StandardError
      case e
      when UrlLinkError
        raise e
      else
        error e
        raise "connecting to site/processing information (#{e.message})"
      end
    end

    call_event(:url_added, url.to_s, logopts)
    if title
      extra.unshift("#{Bold}title#{Bold}: #{title}")
    end
    return extra.join(", ") if title or not @bot.config['url.titles_only']
  end
 122
 123   def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
 124     return if urls.empty?
 125     debug "found urls #{urls.inspect}"
 126     list = m.public? ? @registry[m.target] : nil
 127     debug "display link info: #{display_info}"
 128     urls_displayed = 0
 129     urls.each do |urlstr|
 130       debug "working on #{urlstr}"
 131       next unless urlstr =~ /^https?:/
 132       title = nil
 133       debug "Getting title for #{urlstr}..."
 134       reply = nil
 135       begin
 136         title = get_title_for_url(urlstr,
 137                                   :nick => m.source.nick,
 138                                   :channel => m.channel,
 139                                   :ircline => m.message)
 140         debug "Title #{title ? '' : 'not '} found"
 141         reply = "#{LINK_INFO} #{title}" if title
 142       rescue => e
 143         debug e
 144         # we might get a 404 because of trailing punctuation, so we try again
 145         # with the last character stripped. this might generate invalid URIs
 146         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 147         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 148           # chop off last character, and retry if we still have enough string to
 149           # look like a minimal URL
 150           retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
 151         end
 152         reply = "Error #{e.message}"
 153       end
 154
 155       if display_info > urls_displayed
 156         if reply
 157           m.plainreply(reply, :overlong => :truncate)
 158           urls_displayed += 1
 159         end
 160       end
 161
 162       next unless list
 163
 164       # check to see if this url is already listed
 165       next if list.find {|u| u.url == urlstr }
 166
 167       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 168       debug "#{list.length} urls so far"
 169       list.pop if list.length > @bot.config['url.max_urls']
 170       debug "storing url #{url.url}"
 171       list.unshift url
 172       debug "#{list.length} urls now"
 173     end
 174     @registry[m.target] = list
 175   end
 176
 177   def info(m, params)
 178     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 179     urls = URI.extract(escaped)
 180     Thread.new { handle_urls(m, urls, params[:urls].length) }
 181   end
 182
 183   def listen(m)
 184     return unless m.kind_of?(PrivMessage)
 185     return if m.address?
 186
 187     escaped = URI.escape(m.message, OUR_UNSAFE)
 188     urls = URI.extract(escaped, ['http', 'https'])
 189     return if urls.empty?
 190     Thread.new { handle_urls(m, urls) }
 191   end
 192
 193   def reply_urls(opts={})
 194     list = opts[:list]
 195     max = opts[:max]
 196     channel = opts[:channel]
 197     m = opts[:msg]
 198     return unless list and max and m
 199     list[0..(max-1)].each do |url|
 200       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 201       if @bot.config['url.info_on_list']
 202         title = url.info ||
 203           get_title_for_url(url.url,
 204                             :nick => url.nick, :channel => channel) rescue nil
 205         # If the url info was missing and we now have some, try to upgrade it
 206         if channel and title and not url.info
 207           ll = @registry[channel]
 208           debug ll
 209           if el = ll.find { |u| u.url == url.url }
 210             el.info = title
 211             @registry[channel] = ll
 212           end
 213         end
 214         disp << " --> #{title}" if title
 215       end
 216       m.reply disp, :overlong => :truncate
 217     end
 218   end
 219
 220   def urls(m, params)
 221     channel = params[:channel] ? params[:channel] : m.target
 222     max = params[:limit].to_i
 223     max = 10 if max > 10
 224     max = 1 if max < 1
 225     list = @registry[channel]
 226     if list.empty?
 227       m.reply "no urls seen yet for channel #{channel}"
 228     else
 229       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 230     end
 231   end
 232
 233   def search(m, params)
 234     channel = params[:channel] ? params[:channel] : m.target
 235     max = params[:limit].to_i
 236     string = params[:string]
 237     max = 10 if max > 10
 238     max = 1 if max < 1
 239     regex = Regexp.new(string, Regexp::IGNORECASE)
 240     list = @registry[channel].find_all {|url|
 241       regex.match(url.url) || regex.match(url.nick) ||
 242         (@bot.config['url.info_on_list'] && regex.match(url.info))
 243     }
 244     if list.empty?
 245       m.reply "no matches for channel #{channel}"
 246     else
 247       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 248     end
 249   end
 250 end
 251
 252 plugin = UrlPlugin.new
 253 plugin.map 'urls info *urls', :action => 'info'
 254 plugin.map 'url info *urls', :action => 'info'
 255 plugin.map 'urls search :channel :limit :string', :action => 'search',
 256                           :defaults => {:limit => 4},
 257                           :requirements => {:limit => /^\d+$/},
 258                           :public => false
 259 plugin.map 'urls search :limit :string', :action => 'search',
 260                           :defaults => {:limit => 4},
 261                           :requirements => {:limit => /^\d+$/},
 262                           :private => false
 263 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 264                           :requirements => {:limit => /^\d+$/},
 265                           :public => false
 266 plugin.map 'urls :limit', :defaults => {:limit => 4},
 267                           :requirements => {:limit => /^\d+$/},
 268                           :private => false