X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Fhttputil.rb;h=4ea83104ccc51a4d86a1cff0f89afd2995893725;hb=783ffa4235330029d661752b1023db635b26f2b3;hp=3a267da121fc9328ef2898516252b6602ff142ce;hpb=033504340d0c9b28fabd2564b0122edaac2aef2e;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb index 3a267da1..4ea83104 100644 --- a/lib/rbot/core/utils/httputil.rb +++ b/lib/rbot/core/utils/httputil.rb @@ -6,10 +6,6 @@ # Author:: Tom Gilbert # Author:: Giuseppe "Oblomov" Bilotta # Author:: Dmitry "jsn" Kim -# -# Copyright:: (C) 2002-2005 Tom Gilbert -# Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta -# Copyright:: (C) 2007 Giuseppe Bilotta, Dmitry Kim require 'resolv' require 'net/http' @@ -39,7 +35,7 @@ module ::Net charsets = ['latin1'] # should be in config - if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i) + if ctype.match(/charset=["']?([^\s"']+)["']?/i) charsets << $1 debug "charset #{charsets.last} added from header" end @@ -101,7 +97,28 @@ module ::Net end return ret end + when 'deflate' + debug "inflating body" + # From http://www.koders.com/ruby/fid927B4382397E5115AC0ABE21181AB5C1CBDD5C17.aspx?s=thread: + # -MAX_WBITS stops zlib from looking for a zlib header + inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS) + begin + return inflater.inflate(str) + rescue Zlib::Error => e + raise e + # TODO + # debug "full inflation failed (#{e}), trying to recover as much as possible" + end + when /^(?:iso-8859-\d+|windows-\d+|utf-8|utf8)$/i + # B0rked servers (Freshmeat being one of them) sometimes return the charset + # in the content-encoding; in this case we assume that the document has + # a standarc content-encoding + old_hsh = self.to_hash + self['content-type']= self['content-type']+"; charset="+method.downcase + warning "Charset vs content-encoding confusion, trying to recover: from\n#{old_hsh.pretty_inspect}to\n#{self.to_hash.pretty_inspect}" + return str else + debug self.to_hash raise "Unhandled content encoding #{method}" end end @@ -114,14 +131,21 @@ module ::Net # the partial text at each chunk. Return the partial body. def partial_body(size=0, &block) - self.no_cache = true partial = String.new - self.read_body { |chunk| - partial << chunk + if @read + debug "using body() as partial" + partial = self.body yield self.body_to_utf(self.decompress_body(partial)) if block_given? - break if size and size > 0 and partial.length >= size - } + else + debug "disabling cache" + self.no_cache = true + self.read_body { |chunk| + partial << chunk + yield self.body_to_utf(self.decompress_body(partial)) if block_given? + break if size and size > 0 and partial.length >= size + } + end return self.body_to_utf(self.decompress_body(partial)) end @@ -137,6 +161,10 @@ module Utils # this class can check the bot proxy configuration to determine if a proxy # needs to be used, which includes support for per-url proxy configuration. class HttpUtil + Bot::Config.register Bot::Config::IntegerValue.new('http.read_timeout', + :default => 10, :desc => "Default read timeout for HTTP connections") + Bot::Config.register Bot::Config::IntegerValue.new('http.open_timeout', + :default => 20, :desc => "Default open timeout for HTTP connections") Bot::Config.register Bot::Config::BooleanValue.new('http.use_proxy', :default => false, :desc => "should a proxy be used for HTTP requests?") Bot::Config.register Bot::Config::StringValue.new('http.proxy_uri', :default => false, @@ -265,9 +293,9 @@ class HttpUtil @cache = Hash.new @headers = { 'Accept-Charset' => 'utf-8;q=1.0, *;q=0.8', - 'Accept-Encoding' => 'gzip;q=1, identity;q=0.8, *;q=0.2', + 'Accept-Encoding' => 'gzip;q=1, deflate;q=1, identity;q=0.8, *;q=0.2', 'User-Agent' => - "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)" + "rbot http util #{$version} (#{Irc::Bot::SOURCE_URL})" } debug "starting http cache cleanup timer" @timer = @bot.timer.add(300) { @@ -333,8 +361,8 @@ class HttpUtil # def get_proxy(uri, options = {}) opts = { - :read_timeout => 10, - :open_timeout => 20 + :read_timeout => @bot.config["http.read_timeout"], + :open_timeout => @bot.config["http.open_timeout"] }.merge(options) proxy = nil @@ -404,6 +432,22 @@ class HttpUtil undef_method :body alias :body :cooked_body end + unless resp['content-type'] + debug "No content type, guessing" + resp['content-type'] = + case resp['x-rbot-location'] + when /.html?$/i + 'text/html' + when /.xml$/i + 'application/xml' + when /.xhtml$/i + 'application/xml+xhtml' + when /.(gif|png|jpe?g|jp2|tiff?)$/i + "image/#{$1.sub(/^jpg$/,'jpeg').sub(/^tif$/,'tiff')}" + else + 'application/octetstream' + end + end if block_given? yield(resp) else @@ -433,9 +477,19 @@ class HttpUtil # range:: make a ranged request (usually GET). accepts a string # for HTTP/1.1 "Range:" header (i.e. "bytes=0-1000") # body:: request body (usually for POST requests) + # headers:: additional headers to be set for the request. Its value must + # be a Hash in the form { 'Header' => 'value' } # def get_response(uri_or_s, options = {}, &block) # :yields: resp uri = uri_or_s.kind_of?(URI) ? uri_or_s : URI.parse(uri_or_s.to_s) + unless URI::HTTP === uri + if uri.scheme + raise "#{uri.scheme.inspect} URI scheme is not supported" + else + raise "don't know what to do with #{uri.to_s.inspect}" + end + end + opts = { :max_redir => @bot.config['http.max_redir'], :yield => :final, @@ -444,7 +498,6 @@ class HttpUtil }.merge(options) resp = nil - cached = nil req_class = case opts[:method].to_s.downcase.intern when :head, :"net::http::head" @@ -478,7 +531,9 @@ class HttpUtil debug "get_response(#{uri}, #{opts.inspect})" - if opts[:cache] && cached = @cache[cache_key] + cached = @cache[cache_key] + + if opts[:cache] && cached debug "got cached" if !cached.expired? debug "using cached" @@ -491,7 +546,10 @@ class HttpUtil headers['Range'] = opts[:range] if opts[:range] headers['Authorization'] = opts[:auth_head] if opts[:auth_head] - cached.setup_headers(headers) if cached && (req_class == Net::HTTP::Get) + if opts[:cache] && cached && (req_class == Net::HTTP::Get) + cached.setup_headers headers + end + req = req_class.new(uri.request_uri, headers) if uri.user && uri.password req.basic_auth(uri.user, uri.password) @@ -500,22 +558,24 @@ class HttpUtil req.body = opts[:body] if req_class == Net::HTTP::Post debug "prepared request: #{req.to_hash.inspect}" - get_proxy(uri, opts).start do |http| - http.request(req) do |resp| - resp['x-rbot-location'] = uri.to_s - if Net::HTTPNotModified === resp - debug "not modified" - begin - cached.revalidate(resp) - rescue Exception => e - error e + begin + get_proxy(uri, opts).start do |http| + http.request(req) do |resp| + resp['x-rbot-location'] = uri.to_s + if Net::HTTPNotModified === resp + debug "not modified" + begin + cached.revalidate(resp) + rescue Exception => e + error e + end + debug "reusing cached" + resp = cached.response + elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp + debug "http error, deleting cached obj" if cached + @cache.delete(cache_key) end - debug "reusing cached" - resp = cached.response - elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp - debug "http error, deleting cached obj" if cached - @cache.delete(cache_key) - elsif opts[:cache] + begin return handle_response(uri, resp, opts, &block) ensure @@ -524,10 +584,11 @@ class HttpUtil @cache[cache_key] = cached end end - return ret end - return handle_response(uri, resp, opts, &block) end + rescue Exception => e + error e + raise e.message end end @@ -559,8 +620,8 @@ class HttpUtil opts = {:method => :head}.merge(options) begin resp = get_response(uri, opts, &block) - raise "http error #{resp}" if Net::HTTPClientError === resp || - Net::HTTPServerError == resp + # raise "http error #{resp}" if Net::HTTPClientError === resp || + # Net::HTTPServerError == resp return resp rescue Exception => e error e @@ -590,7 +651,7 @@ class HttpUtil # _uri_:: uri to query (URI object or String) # _nbytes_:: number of bytes to get # - # Partia GET request, returns (if possible) the first _nbytes_ bytes of the + # Partial GET request, returns (if possible) the first _nbytes_ bytes of the # response body, following redirs and caching if requested, yielding the # actual response(s) to the optional block. See get_response for details on # the supported _options_