data/rbot/plugins/url.rb

   1 require 'uri'
   2
# One remembered link: the channel it was posted to, the nick that posted
# it, the time it was seen and the url text. Passing "Url" as the first
# argument also registers the class as Struct::Url.
Url = Struct.new("Url", :channel, :nick, :time, :url)
# Captures the text between an opening <title> and closing </title> tag.
# Whitespace is tolerated inside the angle brackets; the match is
# case-insensitive (/i) and may span several lines (/m).
TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
   5
   6 class UrlPlugin < Plugin
   7   BotConfig.register BotConfigIntegerValue.new('url.max_urls',
   8     :default => 100, :validate => Proc.new{|v| v > 0},
   9     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  10   BotConfig.register BotConfigBooleanValue.new('url.display_link_info',
  11     :default => false,
  12     :desc => "Get the title of any links pasted to the channel and display it (also tells if the link is broken or the site is down)")
  13
  # Runs the parent class setup, then makes an empty Array the registry's
  # default value.
  def initialize
    super
    @registry.set_default([])
  end
  18
  # Returns the usage text for this plugin. The same text is returned
  # whatever +plugin+ and +topic+ are; neither argument is consulted.
  def help(plugin, topic="")
    "urls [<max>=4] => list <max> last urls mentioned in current channel, " \
    "urls search [<max>=4] <regexp> => search for matching urls. " \
    "In a private message, you must specify the channel to query, " \
    "eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  end
  22
  # Extracts the page title from +pagedata+ using TITLE_RE.
  #
  # Returns nil when no title element is found. Otherwise returns a
  # "[Link Info] title: ..." line in which the title has been stripped,
  # had its line breaks (and the whitespace around them) collapsed to a
  # single space, been passed through Utils.decode_html_entities and been
  # cut to its first 256 characters when longer than 255.
  def get_title_from_html(pagedata)
    found = TITLE_RE.match(pagedata)
    return unless found

    flattened = found[1].strip.gsub(/\s*\n+\s*/, " ")
    text = Utils.decode_html_entities(flattened)
    text = text[0..255] if text.length > 255
    "[Link Info] title: #{text}"
  end
  30
  # Reads at most +amount+ units (as counted by String#length) of the body
  # of +response+ and returns them as one string.
  #
  # response:: object whose #read_body yields the body chunk by chunk
  # amount::   upper bound on how much of the body to keep
  #
  # Reading stops as soon as the bound is reached; the chunk that crosses
  # the bound is trimmed so the result never exceeds +amount+.
  def read_data_from_response(response, amount)
    pieces = []
    total = 0

    response.read_body do |piece|
      total += piece.length
      excess = total - amount
      # drop the tail of the chunk that went past the limit
      piece = piece[0...-excess] if excess > 0
      pieces << piece
      break if total >= amount
    end

    pieces.join('')
  end
  54
  # Requests +uri_str+ and builds a one-line "[Link Info]" summary for it.
  #
  # uri_str:: address to look up, either a String or a URI object
  # depth::   number of requests still allowed (the first request plus any
  #           redirects); defaults to the bot's 'http.max_redir' setting
  #
  # Returns
  # * nil when the scheme is not http/https,
  # * whatever get_title_from_html returns for a text/* response,
  # * "[Link Info] type: ..." (with the size when the server sent a
  #   Content-Length header) for any other successful response,
  # * "[Link Info] Error getting link (...)" for non-success responses,
  # * "[Link Info] Error connecting to site (...)" on SocketError.
  #
  # Raises RuntimeError when +depth+ reaches 0. Errors other than
  # SocketError are not rescued here.
  def get_title_for_url(uri_str, depth=@bot.config['http.max_redir'])
    raise "Error: Maximum redirects hit." if depth == 0

    debug "+ Getting #{uri_str.to_s}"
    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # Anchored so that only exactly "http" or "https" is accepted; a
    # missing scheme (nil) is rejected as well.
    return unless url.scheme.to_s =~ /\Ahttps?\z/i

    debug "+ connecting to #{url.host}:#{url.port}"
    http = @bot.httputil.get_proxy(url)
    http.start { |conn|

      conn.request_get(url.request_uri(), @bot.httputil.headers) { |response|

        # Every branch leaves the method with +return+ from inside the
        # request block, so the block never finishes normally and only the
        # part of the body that was explicitly read gets consumed.
        case response
          when Net::HTTPRedirection
            # call self recursively if this is a redirect
            redirect_to = response['location'] || '/'
            debug "+ redirect location: #{redirect_to.inspect}"
            target = URI.join(url.to_s, redirect_to)
            debug "+ whee, redirecting to #{target.to_s}!"
            return get_title_for_url(target, depth-1)
          when Net::HTTPSuccess
            if response['content-type'] =~ /^text\//
              # since the content is 'text/*', retrieve the title from the
              # first 4k of the page
              debug "+ getting #{url.request_uri}"
              data = read_data_from_response(response, 4096)
              return get_title_from_html(data)
            else
              # content doesn't have title, just display info.
              # Content-Length is optional, so only format the size when
              # the header is present.
              length = response['content-length']
              size = ""
              if length
                # insert thousands separators: "1234567" -> "1,234,567"
                grouped = length.gsub(/(\d)(?=(?:\d{3})+(?:\.|$))(\d{3}\..*)?/, '\1,\2')
                size = ", size: #{grouped} bytes"
              end
              return "[Link Info] type: #{response['content-type']}#{size}"
            end
          else
            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
        end # end of "case response"

      } # end of request block
    } # end of http start block

    # only reached if the request block was never invoked
    nil

  rescue SocketError => e
    return "[Link Info] Error connecting to site (#{e.message})"
  end
 109
 110   def listen(m)
 111     return unless m.kind_of?(PrivMessage)
 112     return if m.address?
 113     # TODO support multiple urls in one line
 114     if m.message =~ /(f|ht)tps?:\/\//
 115       if m.message =~ /((f|ht)tps?:\/\/.*?)(?:\s+|$)/
 116         urlstr = $1
 117         list = @registry[m.target]
 118
 119         if @bot.config['url.display_link_info']
 120           debug "Getting title for #{urlstr}..."
 121           begin
 122           title = get_title_for_url urlstr
 123           if title
 124             m.reply title
 125             debug "Title found!"
 126           else
 127             debug "Title not found!"
 128           end
 129           rescue => e
 130             debug "Failed: #{e}"
 131           end
 132         end
 133
 134         # check to see if this url is already listed
 135         return if list.find {|u| u.url == urlstr }
 136
 137         url = Url.new(m.target, m.sourcenick, Time.new, urlstr)
 138         debug "#{list.length} urls so far"
 139         if list.length > @bot.config['url.max_urls']
 140           list.pop
 141         end
 142         debug "storing url #{url.url}"
 143         list.unshift url
 144         debug "#{list.length} urls now"
 145         @registry[m.target] = list
 146       end
 147     end
 148   end
 149
 150   def urls(m, params)
 151     channel = params[:channel] ? params[:channel] : m.target
 152     max = params[:limit].to_i
 153     max = 10 if max > 10
 154     max = 1 if max < 1
 155     list = @registry[channel]
 156     if list.empty?
 157       m.reply "no urls seen yet for channel #{channel}"
 158     else
 159       list[0..(max-1)].each do |url|
 160         m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 161       end
 162     end
 163   end
 164
 165   def search(m, params)
 166     channel = params[:channel] ? params[:channel] : m.target
 167     max = params[:limit].to_i
 168     string = params[:string]
 169     max = 10 if max > 10
 170     max = 1 if max < 1
 171     regex = Regexp.new(string, Regexp::IGNORECASE)
 172     list = @registry[channel].find_all {|url|
 173       regex.match(url.url) || regex.match(url.nick)
 174     }
 175     if list.empty?
 176       m.reply "no matches for channel #{channel}"
 177     else
 178       list[0..(max-1)].each do |url|
 179         m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 180       end
 181     end
 182   end
 183 end
 184 plugin = UrlPlugin.new
 185 plugin.map 'urls search :channel :limit :string', :action => 'search',
 186                           :defaults => {:limit => 4},
 187                           :requirements => {:limit => /^\d+$/},
 188                           :public => false
 189 plugin.map 'urls search :limit :string', :action => 'search',
 190                           :defaults => {:limit => 4},
 191                           :requirements => {:limit => /^\d+$/},
 192                           :private => false
 193 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 194                           :requirements => {:limit => /^\d+$/},
 195                           :public => false
 196 plugin.map 'urls :limit', :defaults => {:limit => 4},
 197                           :requirements => {:limit => /^\d+$/},
 198                           :private => false