data/rbot/plugins/url.rb

   1 require 'net/http'
   2 require 'uri'
   3 require 'cgi'
   4
   5 Url = Struct.new("Url", :channel, :nick, :time, :url)
   6 TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
   7
   8 UNESCAPE_TABLE = {
   9     'raquo' => '>>',
  10     '#8220' => '"',
  11     '#8221' => '"',
  12     '#8212' => '--',
  13     '#39' => '\'',
  14     '#174' => '(R)',
  15     'micro' => 'u',
  16     '' => '',
  17     '' => '',
  18     '' => '',
  19     '' => '',
  20     #'' => '',
  21 }
  22
  23 class UrlPlugin < Plugin
  24   BotConfig.register BotConfigIntegerValue.new('url.max_urls',
  25     :default => 100, :validate => Proc.new{|v| v > 0},
  26     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  27   BotConfig.register BotConfigBooleanValue.new('url.display_link_info',
  28     :default => true,
  29     :desc => "Get the title of any links pasted to the channel and display it (also tells if the link is broken or the site is down)")
  30
  31   def initialize
  32     super
  33     @registry.set_default(Array.new)
  34   end
  35
  36   def help(plugin, topic="")
  37     "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  38   end
  39
  40   def unescape_title(htmldata)
  41     # first pass -- let CGI try to attack it...
  42     htmldata = CGI::unescapeHTML htmldata
  43
  44     # second pass -- destroy the remaining bits...
  45     htmldata.gsub(/(&(.+?);)/) {
  46         symbol = $2
  47
  48         # remove the 0-paddng from unicode integers
  49         if symbol =~ /#(.+)/
  50             symbol = "##{$1.to_i.to_s}"
  51         end
  52
  53         # output the symbol's irc-translated character, or a * if it's unknown
  54         UNESCAPE_TABLE[symbol] || '*'
  55     }
  56   end
  57
  58   def get_title_from_html(pagedata)
  59     return unless TITLE_RE.match(pagedata)
  60     title = $1.strip.gsub(/\s*\n+\s*/, " ")
  61     title = unescape_title title
  62     title = title[0..255] if title.length > 255
  63     "[Link Info] title: #{title}"
  64   end
  65
  66   def get_title_for_url(uri_str, depth=10)
  67     # This god-awful mess is what the ruby http library has reduced me to.
  68     # Python's HTTP lib is so much nicer. :~(
  69
  70     if depth == 0
  71         raise "Error: Maximum redirects hit."
  72     end
  73
  74     puts "+ Getting #{uri_str}"
  75     url = URI.parse(uri_str)
  76     return if url.scheme !~ /https?/
  77
  78     puts "+ connecting to #{url.host}:#{url.port}"
  79     http = @bot.httputil.get_proxy(url)
  80     title = http.start do |http|
  81       url.path = '/' if url.path == ''
  82       head = http.request_head(url.path)
  83       case head
  84         when Net::HTTPRedirection then
  85           # call self recursively if this is a redirect
  86           redirect_to = head['location']
  87           puts "+ redirect location: #{redirect_to}"
  88           url = URI.join url.to_s, redirect_to
  89           puts "+ whee, redirecting to #{url.to_s}!"
  90           title = get_title_for_url(url.to_s, depth-1)
  91         when Net::HTTPSuccess then
  92           if head['content-type'] =~ /^text\// and (not head['content-length'] or head['content-length'].to_i < 400000)
  93             # since the content is 'text/*' and is small enough to
  94             # be a webpage, retrieve the title from the page
  95             puts "+ getting #{url.request_uri}"
  96             response = http.request_get(url.request_uri)
  97             return get_title_from_html(response.body)
  98           else
  99             # content doesn't have title, just display info.
 100             size = head['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
 101             #lastmod = head['last-modified']
 102             return "[Link Info] type: #{head['content-type']}#{size ? ", size: #{size} bytes" : ""}"
 103           end
 104         when Net::HTTPClientError then
 105           return "[Link Info] Error getting link (#{head.code} - #{head.message})"
 106         when Net::HTTPServerError then
 107           return "[Link Info] Error getting link (#{head.code} - #{head.message})"
 108       end
 109     end
 110   rescue SocketError => e
 111     return "[Link Info] Error connecting to site (#{e.message})"
 112   end
 113
 114   def listen(m)
 115     return unless m.kind_of?(PrivMessage)
 116     return if m.address?
 117     # TODO support multiple urls in one line
 118     if m.message =~ /(f|ht)tps?:\/\//
 119       if m.message =~ /((f|ht)tps?:\/\/.*?)(?:\s+|$)/
 120         urlstr = $1
 121         list = @registry[m.target]
 122
 123         if @bot.config['url.display_link_info']
 124           debug "Getting title for #{urlstr}..."
 125           title = get_title_for_url urlstr
 126           if title
 127             m.reply title
 128             debug "Title found!"
 129           else
 130             debug "Title not found!"
 131           end
 132         end
 133
 134         # check to see if this url is already listed
 135         return if list.find {|u| u.url == urlstr }
 136
 137         url = Url.new(m.target, m.sourcenick, Time.new, urlstr)
 138         debug "#{list.length} urls so far"
 139         if list.length > @bot.config['url.max_urls']
 140           list.pop
 141         end
 142         debug "storing url #{url.url}"
 143         list.unshift url
 144         debug "#{list.length} urls now"
 145         @registry[m.target] = list
 146       end
 147     end
 148   end
 149
 150   def urls(m, params)
 151     channel = params[:channel] ? params[:channel] : m.target
 152     max = params[:limit].to_i
 153     max = 10 if max > 10
 154     max = 1 if max < 1
 155     list = @registry[channel]
 156     if list.empty?
 157       m.reply "no urls seen yet for channel #{channel}"
 158     else
 159       list[0..(max-1)].each do |url|
 160         m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 161       end
 162     end
 163   end
 164
 165   def search(m, params)
 166     channel = params[:channel] ? params[:channel] : m.target
 167     max = params[:limit].to_i
 168     string = params[:string]
 169     max = 10 if max > 10
 170     max = 1 if max < 1
 171     regex = Regexp.new(string, Regexp::IGNORECASE)
 172     list = @registry[channel].find_all {|url|
 173       regex.match(url.url) || regex.match(url.nick)
 174     }
 175     if list.empty?
 176       m.reply "no matches for channel #{channel}"
 177     else
 178       list[0..(max-1)].each do |url|
 179         m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 180       end
 181     end
 182   end
 183 end
 184 plugin = UrlPlugin.new
 185 plugin.map 'urls search :channel :limit :string', :action => 'search',
 186                           :defaults => {:limit => 4},
 187                           :requirements => {:limit => /^\d+$/},
 188                           :public => false
 189 plugin.map 'urls search :limit :string', :action => 'search',
 190                           :defaults => {:limit => 4},
 191                           :requirements => {:limit => /^\d+$/},
 192                           :private => false
 193 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 194                           :requirements => {:limit => /^\d+$/},
 195                           :public => false
 196 plugin.map 'urls :limit', :defaults => {:limit => 4},
 197                           :requirements => {:limit => /^\d+$/},
 198                           :private => false