From 0423812d31e5c533468a7d4c284932bfec6fcceb Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Sat, 24 Mar 2007 20:54:55 +0000 Subject: [PATCH] httputil and url plugin improvements, see ChangeLog --- ChangeLog | 12 +++ data/rbot/plugins/url.rb | 135 ++++++++++++-------------------- lib/rbot/core/utils/httputil.rb | 109 +++++++++++++++++++++++++- 3 files changed, 170 insertions(+), 86 deletions(-) diff --git a/ChangeLog b/ChangeLog index 94795136..c4b0750b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +2007-03-24 Giuseppe Bilotta + + * HttpUtil: new get_response method to follow redirects and get a + Net::HTTPResponse object with an open connection, allowing partial + body retrieval. + * HttpUtil: extend Net::HTTPResponse with a #partial_body() method. The + method takes an argument (max bytes to download) and an optional block; + the method yields all the partial content downloaded so far, + instead of the single chunks yielded by #read_body(). + * url plugin: use the new HttpUtil methods. + * url plugin: do not block while trying to get link info. 
+ 2007-03-14 Giuseppe Bilotta * Socket filtering: socket data, both input and output, can now be diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb index 84ee7e43..0d85d473 100644 --- a/data/rbot/plugins/url.rb +++ b/data/rbot/plugins/url.rb @@ -27,89 +27,56 @@ class UrlPlugin < Plugin return unless TITLE_RE.match(pagedata) title = $1.strip.gsub(/\s*\n+\s*/, " ") title = Utils.decode_html_entities title - title = title[0..255] if title.length > 255 - "[Link Info] title: #{title}" + "title: #{title}" end - def read_data_from_response(response, amount) + def get_title_for_url(uri_str) - amount_read = 0 - chunks = [] - - response.read_body do |chunk| # read body now - - amount_read += chunk.length - - # if amount_read > amount - # amount_of_overflow = amount_read - amount - # chunk = chunk[0...-amount_of_overflow] - # end - - chunks << chunk - - break if amount_read >= amount - - end - - chunks.join('') - - end - - def get_title_for_url(uri_str, depth=@bot.config['http.max_redir']) - # This god-awful mess is what the ruby http library has reduced me to. - # Python's HTTP lib is so much nicer. :~( - - if depth == 0 - raise "Error: Maximum redirects hit." - end - - debug "+ Getting #{uri_str.to_s}" url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str) return if url.scheme !~ /https?/ title = nil - debug "+ connecting to #{url.host}:#{url.port}" - http = @bot.httputil.get_proxy(url) - http.start { |http| - - http.request_get(url.request_uri(), @bot.httputil.headers) { |response| - + begin + @bot.httputil.get_response(url) { |response| case response - when Net::HTTPRedirection - # call self recursively if this is a redirect - redirect_to = response['location'] || '/' - debug "+ redirect location: #{redirect_to.inspect}" - url = URI.join(url.to_s, redirect_to) - debug "+ whee, redirecting to #{url.to_s}!" 
- return get_title_for_url(url, depth-1) - when Net::HTTPSuccess - if response['content-type'] =~ /^text\// - # since the content is 'text/*' and is small enough to - # be a webpage, retrieve the title from the page - debug "+ getting #{url.request_uri}" - # was 5*10^4 ... seems to much to me ... 4k should be enough for everybody ;) - data = read_data_from_response(response, 4096) - return get_title_from_html(data) - else - unless @bot.config['url.titles_only'] - # content doesn't have title, just display info. - size = response['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') - size = size ? ", size: #{size} bytes" : "" - return "[Link Info] type: #{response['content-type']}#{size}" - end - end + when Net::HTTPSuccess + if response['content-type'] =~ /^text\// + # since the content is 'text/*' and is small enough to + # be a webpage, retrieve the title from the page + debug "+ getting #{url.request_uri}" + + # we look for the title in the first 4k bytes + # TODO make the amount of data configurable + response.partial_body(4096) { |part| + title = get_title_from_html(part) + return title if title + } + # if nothing was found, return nothing + return else - return "[Link Info] Error getting link (#{response.code} - #{response.message})" - end # end of "case response" - - } # end of request block - } # end of http start block - - return title + unless @bot.config['url.titles_only'] + # content doesn't have title, just display info. + size = response['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') + size = size ? 
", size: #{size} bytes" : "" + return "type: #{response['content-type']}#{size}" + end + end + when Net::HTTPResponse + return "Error getting link (#{response.code} - #{response.message})" + else + raise response + end + } + rescue Object => e + if e.class <= StandardError + error e.inspect + debug e.backtrace.join("\n") + end - rescue SocketError => e - return "[Link Info] Error connecting to site (#{e.message})" + msg = e.respond_to?(:message) ? e.message : e.to_s + return "Error connecting to site (#{e.message})" + end end def listen(m) @@ -122,17 +89,19 @@ class UrlPlugin < Plugin list = @registry[m.target] if @bot.config['url.display_link_info'] - debug "Getting title for #{urlstr}..." - begin - title = get_title_for_url urlstr - if title - m.reply title - debug "Title found!" - else - debug "Title not found!" - end - rescue => e - debug "Failed: #{e}" + Thread.start do + debug "Getting title for #{urlstr}..." + begin + title = get_title_for_url urlstr + if title + m.reply "[Link Info] #{title}" + debug "Title found!" + else + debug "Title not found!" + end + rescue => e + debug "Failed: #{e}" + end end end diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb index ff2fb6c6..6ca12d5b 100644 --- a/lib/rbot/core/utils/httputil.rb +++ b/lib/rbot/core/utils/httputil.rb @@ -10,9 +10,6 @@ # Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta # Copyright:: (C) 2006,2007 Giuseppe Bilotta -module ::Irc -module Utils - require 'resolv' require 'net/http' begin @@ -22,8 +19,30 @@ rescue LoadError => e error "Secured HTTP connections will fail" end +module ::Net + class HTTPResponse + # Read chunks from the body until we have at least _size_ bytes, yielding + # the partial text at each chunk. Return the partial body. 
+ def partial_body(size, &block) + + partial = String.new + + self.read_body { |chunk| + partial << chunk + yield partial + break if size and partial.length >= size + } + + return partial + end + end +end + Net::HTTP.version_1_2 +module ::Irc +module Utils + # class for making http requests easier (mainly for plugins to use) # this class can check the bot proxy configuration to determine if a proxy # needs to be used, which includes support for per-url proxy configuration. @@ -264,6 +283,90 @@ class HttpUtil return nil end + # uri:: uri to query (Uri object or String) + # opts:: options. Currently used: + # :open_timeout:: open timeout for the proxy + # :read_timeout:: read timeout for the proxy + # :cache:: should we cache results? + # + # This method is used to get responses following redirections. + # + # It will return either a Net::HTTPResponse or an error. + # + # If a block is given, it will yield the response or error instead of + # returning it + # + def get_response(uri_or_str, opts={}, &block) + if uri_or_str.kind_of?(URI) + uri = uri_or_str + else + uri = URI.parse(uri_or_str.to_s) + end + debug "Getting #{uri}" + + options = { + :read_timeout => 10, + :open_timeout => 5, + :max_redir => @bot.config["http.max_redir"], + :cache => false, + :yield => :none + }.merge(opts) + + cache = options[:cache] + + proxy = get_proxy(uri) + proxy.open_timeout = options[:open_timeout] + proxy.read_timeout = options[:read_timeout] + + begin + proxy.start() {|http| + req = Net::HTTP::Get.new(uri.request_uri(), @headers) + if uri.user and uri.password + req.basic_auth(uri.user, uri.password) + end + http.request(req) { |resp| + case resp + when Net::HTTPSuccess + if cache + debug "Caching #{uri.to_s}" + cache_response(uri.to_s, resp) + end + when Net::HTTPRedirection + if resp.key?('location') + new_loc = URI.join(uri, resp['location']) rescue URI.parse(resp['location']) + debug "Redirecting #{uri} to #{new_loc}" + if options[:max_redir] > 0 + new_opts = options.dup + 
new_opts[:max_redir] -= 1 + return get_response(new_loc, new_opts, &block) + else + raise "Too many redirections" + end + end + end + if block_given? + yield resp + else + return resp + end + } + } + rescue StandardError, Timeout::Error => e + error "HttpUtil.get_response exception: #{e.inspect}, while trying to get #{uri}" + debug e.backtrace.join("\n") + def e.body + nil + end + if block_given? + yield e + else + return e + end + end + + raise "This shouldn't happen" + end + def cache_response(k, resp) begin if resp.key?('pragma') and resp['pragma'] == 'no-cache' -- 2.39.2