X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Fhttputil.rb;h=3d4133e0a80a2b52ebd0878072272d9db94717e6;hb=41e46ac67664f3bbec427d4e332a89783073e856;hp=476a71c18810f67a99d7afea56a8a69770f2c2f8;hpb=4e3660831d7f4fbfe58341e9ce95bef620f13d6b;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb index 476a71c1..3d4133e0 100644 --- a/lib/rbot/core/utils/httputil.rb +++ b/lib/rbot/core/utils/httputil.rb @@ -13,17 +13,22 @@ require 'resolv' require 'net/http' +require 'cgi' require 'iconv' begin require 'net/https' rescue LoadError => e - error "Couldn't load 'net/https': #{e.inspect}" + error "Couldn't load 'net/https': #{e.pretty_inspect}" error "Secured HTTP connections will fail" end -module ::Net - class HTTPResponse - attr_accessor :no_cache +# To handle Gzipped pages +require 'stringio' +require 'zlib' + +module ::Net + class HTTPResponse + attr_accessor :no_cache if !instance_methods.include?('raw_body') alias :raw_body :body end @@ -34,7 +39,7 @@ module ::Net charsets = ['latin1'] # should be in config - if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i) + if ctype.match(/charset=["']?([^\s"']+)["']?/i) charsets << $1 debug "charset #{charsets.last} added from header" end @@ -56,36 +61,83 @@ module ::Net def body_to_utf(str) charsets = self.body_charset(str) or return str - charsets.reverse_each { |charset| - begin - return Iconv.iconv('utf-8//ignore', charset, str).first - rescue - debug "conversion failed for #{charset}" + charsets.reverse_each do |charset| + # XXX: this one is really ugly, but i don't know how to make it better + # -jsn + + 0.upto(5) do |off| + begin + debug "trying #{charset} / offset #{off}" + return Iconv.iconv('utf-8//ignore', + charset, + str.slice(0 .. 
(-1 - off))).first + rescue + debug "conversion failed for #{charset} / offset #{off}" + end end - } + end return str end - def body - return self.body_to_utf(self.raw_body) + def decompress_body(str) + method = self['content-encoding'] + case method + when nil + return str + when /gzip/ # Matches gzip, x-gzip, and the non-rfc-compliant gzip;q=\d sent by some servers + debug "gunzipping body" + begin + return Zlib::GzipReader.new(StringIO.new(str)).read + rescue Zlib::Error => e + # If we can't unpack the whole stream (e.g. because we're doing a + # partial read + debug "full gunzipping failed (#{e}), trying to recover as much as possible" + ret = "" + begin + Zlib::GzipReader.new(StringIO.new(str)).each_byte { |byte| + ret << byte + } + rescue + end + return ret + end + when 'deflate' + debug "inflating body" + # From http://www.koders.com/ruby/fid927B4382397E5115AC0ABE21181AB5C1CBDD5C17.aspx?s=thread: + # -MAX_WBITS stops zlib from looking for a zlib header + inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS) + begin + return inflater.inflate(str) + rescue Zlib::Error => e + raise e + # TODO + # debug "full inflation failed (#{e}), trying to recover as much as possible" + end + else + raise "Unhandled content encoding #{method}" + end + end + + def cooked_body + return self.body_to_utf(self.decompress_body(self.raw_body)) end - # Read chunks from the body until we have at least _size_ bytes, yielding - # the partial text at each chunk. Return the partial body. - def partial_body(size=0, &block) + # Read chunks from the body until we have at least _size_ bytes, yielding + # the partial text at each chunk. Return the partial body. + def partial_body(size=0, &block) self.no_cache = true - partial = String.new + partial = String.new - self.read_body { |chunk| - partial << chunk - yield self.body_to_utf(partial) if block_given? 
- break if size and size > 0 and partial.length >= size - } + self.read_body { |chunk| + partial << chunk + yield self.body_to_utf(self.decompress_body(partial)) if block_given? + break if size and size > 0 and partial.length >= size + } - return self.body_to_utf(partial) - end - end + return self.body_to_utf(self.decompress_body(partial)) + end + end end Net::HTTP.version_1_2 @@ -97,35 +149,39 @@ module Utils # this class can check the bot proxy configuration to determine if a proxy # needs to be used, which includes support for per-url proxy configuration. class HttpUtil - BotConfig.register BotConfigBooleanValue.new('http.use_proxy', + Bot::Config.register Bot::Config::IntegerValue.new('http.read_timeout', + :default => 10, :desc => "Default read timeout for HTTP connections") + Bot::Config.register Bot::Config::IntegerValue.new('http.open_timeout', + :default => 20, :desc => "Default open timeout for HTTP connections") + Bot::Config.register Bot::Config::BooleanValue.new('http.use_proxy', :default => false, :desc => "should a proxy be used for HTTP requests?") - BotConfig.register BotConfigStringValue.new('http.proxy_uri', :default => false, + Bot::Config.register Bot::Config::StringValue.new('http.proxy_uri', :default => false, :desc => "Proxy server to use for HTTP requests (URI, e.g http://proxy.host:port)") - BotConfig.register BotConfigStringValue.new('http.proxy_user', + Bot::Config.register Bot::Config::StringValue.new('http.proxy_user', :default => nil, :desc => "User for authenticating with the http proxy (if required)") - BotConfig.register BotConfigStringValue.new('http.proxy_pass', + Bot::Config.register Bot::Config::StringValue.new('http.proxy_pass', :default => nil, :desc => "Password for authenticating with the http proxy (if required)") - BotConfig.register BotConfigArrayValue.new('http.proxy_include', + Bot::Config.register Bot::Config::ArrayValue.new('http.proxy_include', :default => [], :desc => "List of regexps to check against a URI's 
hostname/ip to see if we should use the proxy to access this URI. All URIs are proxied by default if the proxy is set, so this is only required to re-include URIs that might have been excluded by the exclude list. e.g. exclude /.*\.foo\.com/, include bar\.foo\.com") - BotConfig.register BotConfigArrayValue.new('http.proxy_exclude', + Bot::Config.register Bot::Config::ArrayValue.new('http.proxy_exclude', :default => [], :desc => "List of regexps to check against a URI's hostname/ip to see if we should use avoid the proxy to access this URI and access it directly") - BotConfig.register BotConfigIntegerValue.new('http.max_redir', + Bot::Config.register Bot::Config::IntegerValue.new('http.max_redir', :default => 5, :desc => "Maximum number of redirections to be used when getting a document") - BotConfig.register BotConfigIntegerValue.new('http.expire_time', + Bot::Config.register Bot::Config::IntegerValue.new('http.expire_time', :default => 60, :desc => "After how many minutes since last use a cached document is considered to be expired") - BotConfig.register BotConfigIntegerValue.new('http.max_cache_time', + Bot::Config.register Bot::Config::IntegerValue.new('http.max_cache_time', :default => 60*24, :desc => "After how many minutes since first use a cached document is considered to be expired") - BotConfig.register BotConfigIntegerValue.new('http.no_expire_cache', + Bot::Config.register Bot::Config::IntegerValue.new('http.no_expire_cache', :default => false, :desc => "Set this to true if you want the bot to never expire the cached pages") - BotConfig.register BotConfigIntegerValue.new('http.info_bytes', + Bot::Config.register Bot::Config::IntegerValue.new('http.info_bytes', :default => 8192, :desc => "How many bytes to download from a web page to find some information. 
Set to 0 to let the bot download the whole page.") @@ -212,36 +268,41 @@ class HttpUtil self.revalidate self.response.raw_body rescue Exception => e - error e.message - error e.backtrace.join("\n") + error e raise e end end end + # Create the HttpUtil instance, associating it with Bot _bot_ + # def initialize(bot) @bot = bot @cache = Hash.new @headers = { 'Accept-Charset' => 'utf-8;q=1.0, *;q=0.8', + 'Accept-Encoding' => 'gzip;q=1, deflate;q=1, identity;q=0.8, *;q=0.2', 'User-Agent' => - "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)" - } + "rbot http util #{$version} (#{Irc::Bot::SOURCE_URL})" + } debug "starting http cache cleanup timer" @timer = @bot.timer.add(300) { self.remove_stale_cache unless @bot.config['http.no_expire_cache'] } - end + end + # Clean up on HttpUtil unloading, by stopping the cache cleanup timer. def cleanup debug 'stopping http cache cleanup timer' @bot.timer.remove(@timer) end - # if http_proxy_include or http_proxy_exclude are set, then examine the - # uri to see if this is a proxied uri - # the in/excludes are a list of regexps, and each regexp is checked against - # the server name, and its IP addresses + # This method checks if a proxy is required to access _uri_, by looking at + # the values of config values +http.proxy_include+ and +http.proxy_exclude+. + # + # Each of these config values, if set, should be a Regexp the server name and + # IP address should be checked against. + # def proxy_required(uri) use_proxy = true if @bot.config["http.proxy_exclude"].empty? && @bot.config["http.proxy_include"].empty? @@ -281,17 +342,15 @@ class HttpUtil return use_proxy end - # uri:: Uri to create a proxy for + # _uri_:: URI to create a proxy for + # + # Return a net/http Proxy object, configured for proxying based on the + # bot's proxy configuration. See proxy_required for more details on this. # - # return a net/http Proxy object, which is configured correctly for - # proxying based on the bot's proxy configuration. 
- # This will include per-url proxy configuration based on the bot config - # +http_proxy_include/exclude+ options. - def get_proxy(uri, options = {}) opts = { - :read_timeout => 10, - :open_timeout => 5 + :read_timeout => @bot.config["http.read_timeout"], + :open_timeout => @bot.config["http.open_timeout"] }.merge(options) proxy = nil @@ -326,7 +385,14 @@ class HttpUtil return h end - def handle_response(uri, resp, opts, &block) + # Internal method used to handle response _resp_ received when making a + # request for URI _uri_. + # + # It follows redirects, optionally yielding them if option :yield is :all. + # + # Also yields and returns the final _resp_. + # + def handle_response(uri, resp, opts, &block) # :yields: resp if Net::HTTPRedirection === resp && opts[:max_redir] >= 0 if resp.key?('location') raise 'Too many redirections' if opts[:max_redir] <= 0 @@ -339,12 +405,37 @@ class HttpUtil when :post, :"net::http::post" new_opts[:method] = :get end + if resp['set-cookie'] + debug "setting cookie #{resp['set-cookie']}" + new_opts[:headers] ||= Hash.new + new_opts[:headers]['Cookie'] = resp['set-cookie'] + end debug "following the redirect to #{new_loc}" return get_response(new_loc, new_opts, &block) else warning ":| redirect w/o location?" end end + class << resp + undef_method :body + alias :body :cooked_body + end + unless resp['content-type'] + debug "No content type, guessing" + resp['content-type'] = + case resp['x-rbot-location'] + when /.html?$/i + 'text/html' + when /.xml$/i + 'application/xml' + when /.xhtml$/i + 'application/xml+xhtml' + when /.(gif|png|jpe?g|jp2|tiff?)$/i + "image/#{$1.sub(/^jpg$/,'jpeg').sub(/^tif$/,'tiff')}" + else + 'application/octetstream' + end + end if block_given? yield(resp) else @@ -354,27 +445,30 @@ class HttpUtil return resp end - # uri:: uri to query (Uri object or String) - # opts:: options. 
Currently used: - # :method:: request method [:get (default), :post or :head] - # :open_timeout:: open timeout for the proxy - # :read_timeout:: read timeout for the proxy - # :cache:: should we cache results? - # :yield:: if :final [default], call &block for the response object - # if :all, call &block for all intermediate redirects, too - # :max_redir:: how many redirects to follow before raising the exception - # if -1, don't follow redirects, just return them - # :range:: make a ranged request (usually GET). accepts a string - # for HTTP/1.1 "Range:" header (i.e. "bytes=0-1000") - # :body:: request body (usually for POST requests) - # - # Generic http transaction method + # _uri_:: uri to query (URI object or String) # - # It will return a HTTP::Response object or raise an exception + # Generic http transaction method. It will return a Net::HTTPResponse + # object or raise an exception # # If a block is given, it will yield the response (see :yield option) - - def get_response(uri_or_s, options = {}, &block) + # + # Currently supported _options_: + # + # method:: request method [:get (default), :post or :head] + # open_timeout:: open timeout for the proxy + # read_timeout:: read timeout for the proxy + # cache:: should we cache results? + # yield:: if :final [default], calls the block for the response object; + # if :all, call the block for all intermediate redirects, too + # max_redir:: how many redirects to follow before raising the exception + # if -1, don't follow redirects, just return them + # range:: make a ranged request (usually GET). accepts a string + # for HTTP/1.1 "Range:" header (i.e. "bytes=0-1000") + # body:: request body (usually for POST requests) + # headers:: additional headers to be set for the request. Its value must + # be a Hash in the form { 'Header' => 'value' } + # + def get_response(uri_or_s, options = {}, &block) # :yields: resp uri = uri_or_s.kind_of?(URI) ? 
uri_or_s : URI.parse(uri_or_s.to_s) opts = { :max_redir => @bot.config['http.max_redir'], @@ -426,25 +520,30 @@ class HttpUtil return handle_response(uri, cached.response, opts, &block) end end - + headers = @headers.dup.merge(opts[:headers] || {}) headers['Range'] = opts[:range] if opts[:range] + headers['Authorization'] = opts[:auth_head] if opts[:auth_head] cached.setup_headers(headers) if cached && (req_class == Net::HTTP::Get) req = req_class.new(uri.request_uri, headers) - req.basic_auth(uri.user, uri.password) if uri.user && uri.password + if uri.user && uri.password + req.basic_auth(uri.user, uri.password) + opts[:auth_head] = req['Authorization'] + end req.body = opts[:body] if req_class == Net::HTTP::Post debug "prepared request: #{req.to_hash.inspect}" + begin get_proxy(uri, opts).start do |http| http.request(req) do |resp| + resp['x-rbot-location'] = uri.to_s if Net::HTTPNotModified === resp debug "not modified" begin cached.revalidate(resp) rescue Exception => e - error e.message - error e.backtrace.join("\n") + error e end debug "reusing cached" resp = cached.response @@ -465,26 +564,37 @@ class HttpUtil return handle_response(uri, resp, opts, &block) end end + rescue Exception => e + error e + raise e.message + end end - # uri:: uri to query (Uri object) + # _uri_:: uri to query (URI object or String) + # + # Simple GET request, returns (if possible) response body following redirs + # and caching if requested, yielding the actual response(s) to the optional + # block. 
See get_response for details on the supported _options_ # - # simple get request, returns (if possible) response body following redirs - # and caching if requested - def get(uri, opts = {}, &block) + def get(uri, options = {}, &block) # :yields: resp begin - resp = get_response(uri, opts, &block) + resp = get_response(uri, options, &block) raise "http error: #{resp}" unless Net::HTTPOK === resp || Net::HTTPPartialContent === resp return resp.body rescue Exception => e - error e.message - error e.backtrace.join("\n") + error e end return nil end - def head(uri, options = {}, &block) + # _uri_:: uri to query (URI object or String) + # + # Simple HEAD request, returns (if possible) response head following redirs + # and caching if requested, yielding the actual response(s) to the optional + # block. See get_response for details on the supported _options_ + # + def head(uri, options = {}, &block) # :yields: resp opts = {:method => :head}.merge(options) begin resp = get_response(uri, opts, &block) @@ -492,26 +602,39 @@ class HttpUtil Net::HTTPServerError == resp return resp rescue Exception => e - error e.message - error e.backtrace.join("\n") + error e end return nil end - def post(uri, data, options = {}, &block) + # _uri_:: uri to query (URI object or String) + # _data_:: body of the POST + # + # Simple POST request, returns (if possible) response following redirs and + # caching if requested, yielding the response(s) to the optional block. 
See + # get_response for details on the supported _options_ + # + def post(uri, data, options = {}, &block) # :yields: resp opts = {:method => :post, :body => data, :cache => false}.merge(options) begin resp = get_response(uri, opts, &block) raise 'http error' unless Net::HTTPOK === resp return resp rescue Exception => e - error e.message - error e.backtrace.join("\n") + error e end return nil end - def get_partial(uri, nbytes = @bot.config['http.info_bytes'], options = {}, &block) + # _uri_:: uri to query (URI object or String) + # _nbytes_:: number of bytes to get + # + # Partia GET request, returns (if possible) the first _nbytes_ bytes of the + # response body, following redirs and caching if requested, yielding the + # actual response(s) to the optional block. See get_response for details on + # the supported _options_ + # + def get_partial(uri, nbytes = @bot.config['http.info_bytes'], options = {}, &block) # :yields: resp opts = {:range => "bytes=0-#{nbytes}"}.merge(options) return get(uri, opts, &block) end @@ -527,7 +650,7 @@ class HttpUtil (now - val.last_used > max_last) || (now - val.first_used > max_first) } rescue => e - error "Failed to remove stale cache: #{e.inspect}" + error "Failed to remove stale cache: #{e.pretty_inspect}" end debug "#{@cache.size} pages after" end @@ -547,6 +670,7 @@ class HttpUtilPlugin < CoreBotModule debug 'shutting down httputil' @bot.httputil.cleanup @bot.httputil = nil + super end end