data/rbot/plugins/url.rb

   1 require 'net/http'
   2 require 'uri'
   3 require 'cgi'
   4
   5 Url = Struct.new("Url", :channel, :nick, :time, :url)
   6 TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
   7
   8 class UrlPlugin < Plugin
   9   BotConfig.register BotConfigIntegerValue.new('url.max_urls',
  10     :default => 100, :validate => Proc.new{|v| v > 0},
  11     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  12   BotConfig.register BotConfigBooleanValue.new('url.display_link_info',
  13     :default => true,
  14     :desc => "Get the title of any links pasted to the channel and display it (also tells if the link is broken or the site is down)")
  15
  16   def initialize
  17     super
  18     @registry.set_default(Array.new)
  19   end
  20
  21   def help(plugin, topic="")
  22     "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  23   end
  24
  25   def get_title_from_html(pagedata)
  26     return unless TITLE_RE.match(pagedata)
  27     title = $1.strip.gsub(/\s*\n+\s*/, " ")
  28     title = CGI::unescapeHTML title
  29     title = title[0..255] if title.length > 255
  30     "[Link Info] title: #{title}"
  31   end
  32
  33   def get_title_for_url(uri_str, depth=10)
  34     # This god-awful mess is what the ruby http library has reduced me to.
  35     # Python's HTTP lib is so much nicer. :~(
  36
  37     if depth == 0
  38         raise "Error: Maximum redirects hit."
  39     end
  40
  41     puts "+ Getting #{uri_str}"
  42     url = URI.parse(uri_str)
  43     return if url.scheme !~ /https?/
  44
  45     puts "+ connecting to #{url.host}:#{url.port}"
  46     http = @bot.httputil.get_proxy(url)
  47     title = http.start do |http|
  48       url.path = '/' if url.path == ''
  49       head = http.request_head(url.path)
  50       case head
  51         when Net::HTTPRedirection then
  52           # call self recursively if this is a redirect
  53           redirect_to = head['location']
  54           puts "+ redirect location: #{redirect_to}"
  55           url = URI.join url.to_s, redirect_to
  56           puts "+ whee, redirecting to #{url.to_s}!"
  57           title = get_title_for_url(url.to_s, depth-1)
  58         when Net::HTTPSuccess then
  59           if head['content-type'] =~ /^text\// and (not head['content-length'] or head['content-length'].to_i < 400000)
  60             # since the content is 'text/*' and is small enough to
  61             # be a webpage, retrieve the title from the page
  62             puts "+ getting #{url.request_uri}"
  63             response = http.request_get(url.request_uri)
  64             return get_title_from_html(response.body)
  65           else
  66             # content doesn't have title, just display info.
  67             size = head['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
  68             #lastmod = head['last-modified']
  69             return "[Link Info] type: #{head['content-type']}#{size ? ", size: #{size} bytes" : ""}"
  70           end
  71         when Net::HTTPClientError then
  72           return "[Link Info] Error getting link (#{head.code} - #{head.message})"
  73         when Net::HTTPServerError then
  74           return "[Link Info] Error getting link (#{head.code} - #{head.message})"
  75       end
  76     end
  77   rescue SocketError => e
  78     return "[Link Info] Error connecting to site (#{e.message})"
  79   end
  80
  81   def listen(m)
  82     return unless m.kind_of?(PrivMessage)
  83     return if m.address?
  84     # TODO support multiple urls in one line
  85     if m.message =~ /(f|ht)tps?:\/\//
  86       if m.message =~ /((f|ht)tps?:\/\/.*?)(?:\s+|$)/
  87         urlstr = $1
  88         list = @registry[m.target]
  89
  90         if @bot.config['url.display_link_info']
  91           debug "Getting title for #{urlstr}..."
  92           title = get_title_for_url urlstr
  93           if title
  94             m.reply title
  95             debug "Title found!"
  96           else
  97             debug "Title not found!"
  98           end
  99         end
 100
 101         # check to see if this url is already listed
 102         return if list.find {|u| u.url == urlstr }
 103
 104         url = Url.new(m.target, m.sourcenick, Time.new, urlstr)
 105         debug "#{list.length} urls so far"
 106         if list.length > @bot.config['url.max_urls']
 107           list.pop
 108         end
 109         debug "storing url #{url.url}"
 110         list.unshift url
 111         debug "#{list.length} urls now"
 112         @registry[m.target] = list
 113       end
 114     end
 115   end
 116
 117   def urls(m, params)
 118     channel = params[:channel] ? params[:channel] : m.target
 119     max = params[:limit].to_i
 120     max = 10 if max > 10
 121     max = 1 if max < 1
 122     list = @registry[channel]
 123     if list.empty?
 124       m.reply "no urls seen yet for channel #{channel}"
 125     else
 126       list[0..(max-1)].each do |url|
 127         m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 128       end
 129     end
 130   end
 131
 132   def search(m, params)
 133     channel = params[:channel] ? params[:channel] : m.target
 134     max = params[:limit].to_i
 135     string = params[:string]
 136     max = 10 if max > 10
 137     max = 1 if max < 1
 138     regex = Regexp.new(string, Regexp::IGNORECASE)
 139     list = @registry[channel].find_all {|url|
 140       regex.match(url.url) || regex.match(url.nick)
 141     }
 142     if list.empty?
 143       m.reply "no matches for channel #{channel}"
 144     else
 145       list[0..(max-1)].each do |url|
 146         m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 147       end
 148     end
 149   end
 150 end
 151 plugin = UrlPlugin.new
 152 plugin.map 'urls search :channel :limit :string', :action => 'search',
 153                           :defaults => {:limit => 4},
 154                           :requirements => {:limit => /^\d+$/},
 155                           :public => false
 156 plugin.map 'urls search :limit :string', :action => 'search',
 157                           :defaults => {:limit => 4},
 158                           :requirements => {:limit => /^\d+$/},
 159                           :private => false
 160 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 161                           :requirements => {:limit => /^\d+$/},
 162                           :public => false
 163 plugin.map 'urls :limit', :defaults => {:limit => 4},
 164                           :requirements => {:limit => /^\d+$/},
 165                           :private => false