X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=data%2Frbot%2Fplugins%2Furl.rb;h=ca1e6ed86225e6423e865e1602108626771cf343;hb=9d29f400bb3a354779185d61049ce7cdfa7744ee;hp=08fb1f40d487504bfae69c6e837d1999c78e770e;hpb=360bed41ac8ac97419ac40cb971fba94a425ca19;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb index 08fb1f40..ca1e6ed8 100644 --- a/data/rbot/plugins/url.rb +++ b/data/rbot/plugins/url.rb @@ -1,18 +1,16 @@ -require 'net/http' require 'uri' -require 'cgi' Url = Struct.new("Url", :channel, :nick, :time, :url) -TITLE_RE = /<\s*title\s*>(.+)<\s*\/title\s*>/im +TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im class UrlPlugin < Plugin BotConfig.register BotConfigIntegerValue.new('url.max_urls', :default => 100, :validate => Proc.new{|v| v > 0}, :desc => "Maximum number of urls to store. New urls replace oldest ones.") BotConfig.register BotConfigBooleanValue.new('url.display_link_info', - :default => true, + :default => false, :desc => "Get the title of any links pasted to the channel and display it (also tells if the link is broken or the site is down)") - + def initialize super @registry.set_default(Array.new) @@ -25,56 +23,86 @@ class UrlPlugin < Plugin def get_title_from_html(pagedata) return unless TITLE_RE.match(pagedata) title = $1.strip.gsub(/\s*\n+\s*/, " ") - title = CGI::unescapeHTML title + title = Utils.decode_html_entities title title = title[0..255] if title.length > 255 "[Link Info] title: #{title}" end - def get_title_for_url(uri_str) + def read_data_from_response(response, amount) + + amount_read = 0 + chunks = [] + + response.read_body do |chunk| # read body now + + amount_read += chunk.length + + if amount_read > amount + amount_of_overflow = amount_read - amount + chunk = chunk[0...-amount_of_overflow] + end + + chunks << chunk + + break if amount_read >= amount + + end + + chunks.join('') + + end + + def get_title_for_url(uri_str, depth=@bot.config['http.max_redir']) # This god-awful mess is what the ruby http library has reduced me to. # Python's HTTP lib is so much nicer. :~( - - puts "+ Getting #{uri_str}" - url = URI.parse(uri_str) + + if depth == 0 + raise "Error: Maximum redirects hit." + end + + debug "+ Getting #{uri_str.to_s}" + url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str) return if url.scheme !~ /https?/ - - puts "+ connecting to #{url.host}:#{url.port}" - title = Net::HTTP.start(url.host, url.port) do |http| - url.path = '/' if url.path == '' - head = http.request_head(url.path) - case head - when Net::HTTPRedirection then - # call self recursively if this is a redirect - redirect_to = head['location'] - puts "+ redirect location: #{redirect_to}" - absolute_uris = URI.extract redirect_to - raise "wtf! redirect = #{redirect_to}" if absolute_uris.size > 1 - if absolute_uris.size == 1 - url = URI.parse absolute_uris[0] - else - url.path = redirect_to - end - puts "+ whee, redirect to #{url.to_s}!" 
-          title = get_title_for_url(url.to_s)
-        when Net::HTTPSuccess then
-          if head['content-type'] =~ /^text\//
-            # content is 'text/*'
-            # retrieve the title from the page
-            puts "+ getting #{url.path}"
-            response = http.request_get(url.path)
-            return get_title_from_html(response.body)
+
+    title = nil
+
+    debug "+ connecting to #{url.host}:#{url.port}"
+    http = @bot.httputil.get_proxy(url)
+    http.start { |http|
+
+      http.request_get(url.request_uri(), @bot.httputil.headers) { |response|
+
+        case response
+          when Net::HTTPRedirection
+            # call self recursively if this is a redirect
+            redirect_to = response['location'] || '/'
+            debug "+ redirect location: #{redirect_to.inspect}"
+            url = URI.join(url.to_s, redirect_to)
+            debug "+ whee, redirecting to #{url.to_s}!"
+            return get_title_for_url(url, depth-1)
+          when Net::HTTPSuccess
+            if response['content-type'] =~ /^text\//
+              # since the content is 'text/*' and is small enough to
+              # be a webpage, retrieve the title from the page
+              debug "+ getting #{url.request_uri}"
+              # was 5*10^4 ... seems too much to me ... 4k should be enough for everybody ;)
+              data = read_data_from_response(response, 4096)
+              return get_title_from_html(data)
+            else
+              # content doesn't have a title; just display its type and size.
+              size = response['content-length'].to_s.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
+              size = size.empty? ? "" : ", size: #{size} bytes"
+              return "[Link Info] type: #{response['content-type']}#{size}"
+            end
           else
-            # content isn't 'text/*'... display info about the file.
-            size = head['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
-            #lastmod = head['last-modified']
-            return "[Link Info] type: #{head['content-type']}#{size ? ", size: #{size} bytes" : ""}"
-          end
-        when Net::HTTPClientError then
-          return "Error getting link (#{response.code} - #{response.message})"
-        when Net::HTTPServerError then
-          return "Error getting link (#{response.code} - #{response.message})"
-      end
-    end
+            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+        end # end of "case response"
+
+      } # end of request block
+    } # end of http start block
+
+    return title
+
   rescue SocketError => e
     return "[Link Info] Error connecting to site (#{e.message})"
   end
@@ -88,20 +116,24 @@ class UrlPlugin < Plugin
       urlstr = $1
       list = @registry[m.target]
 
-      if @bot.config['url.say_titles']
+      if @bot.config['url.display_link_info']
         debug "Getting title for #{urlstr}..."
+        begin
         title = get_title_for_url urlstr
         if title
           m.reply title
           debug "Title found!"
         else
           debug "Title not found!"
-        end
+          end
+        rescue => e
+          debug "Failed: #{e}"
+        end
       end
-    
+
       # check to see if this url is already listed
       return if list.find {|u| u.url == urlstr }
-    
+
       url = Url.new(m.target, m.sourcenick, Time.new, urlstr)
       debug "#{list.length} urls so far"
       if list.length > @bot.config['url.max_urls']
@@ -136,7 +168,7 @@ class UrlPlugin < Plugin
     string = params[:string]
     max = 10 if max > 10
     max = 1 if max < 1
-    regex = Regexp.new(string)
+    regex = Regexp.new(string, Regexp::IGNORECASE)
     list = @registry[channel].find_all {|url|
       regex.match(url.url) || regex.match(url.nick)
     }
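The new read_data_from_response helper is the heart of the patch: it streams the body through Net::HTTP#read_body, trims whatever part of the final chunk crosses the limit, and stops reading, so the bot never buffers a large page just to extract a title. Below is a minimal standalone sketch of the same technique, assuming plain Net::HTTP in place of rbot's @bot.httputil proxy wrapper; the method name read_capped_body and the example.com URL are illustrative, not part of the patch.

    require 'net/http'
    require 'uri'

    # Read at most +limit+ bytes of a streamed HTTP response body.
    def read_capped_body(response, limit)
      amount_read = 0
      chunks = []
      response.read_body do |chunk|     # chunks arrive as the socket delivers them
        amount_read += chunk.length
        if amount_read > limit          # trim the part of the chunk past the limit
          overflow = amount_read - limit
          chunk = chunk[0...-overflow]
        end
        chunks << chunk
        break if amount_read >= limit   # stop streaming once we have enough
      end
      chunks.join
    end

    url = URI.parse('http://www.example.com/')
    Net::HTTP.start(url.host, url.port) do |http|
      # the block form of request_get defers the body, so read_body can stream it
      http.request_get(url.request_uri) do |response|
        data = read_capped_body(response, 4096)
        puts "fetched #{data.length} bytes"
      end
    end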
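The rewritten redirect branch is also worth isolating. The old code picked the Location header apart with URI.extract; the new code lets URI.join resolve relative and absolute targets alike against the current URL, and the depth parameter (seeded from the http.max_redir config value) bounds the recursion. A rough self-contained equivalent, again assuming plain Net::HTTP and an arbitrary limit of 5; the helper name fetch_following_redirects is made up for the example:

    require 'net/http'
    require 'uri'

    # Follow up to +depth+ redirects, then return the final response body.
    def fetch_following_redirects(uri_str, depth = 5)
      raise "Error: Maximum redirects hit." if depth == 0
      url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
      Net::HTTP.start(url.host, url.port) do |http|
        response = http.request_get(url.request_uri)
        case response
        when Net::HTTPRedirection
          # URI.join copes with both "Location: /foo" and absolute URLs
          target = URI.join(url.to_s, response['location'] || '/')
          return fetch_following_redirects(target, depth - 1)
        when Net::HTTPSuccess
          return response.body
        else
          raise "Error getting link (#{response.code} - #{response.message})"
        end
      end
    end

Like the patch, this opens a fresh connection per hop rather than reusing one, which keeps the recursion trivially correct when a redirect moves to a different host or port.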
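One line deserves a gloss: the gsub that comma-groups the content-length digits. Ruby parses \d{3}+ as a repeated group, (?:\d{3})+, rather than a possessive quantifier, so the lookahead succeeds exactly where the remaining digits divide into whole groups of three, and each such position gets a comma:

    '1234567'.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/, '\1,\2')
    # => "1,234,567"

Content-Length is an optional header, which is why the branch above guards the gsub with .to_s instead of calling it on a possible nil.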