From b11c3c4042b03e36639370002ecf86c44f7ddde4 Mon Sep 17 00:00:00 2001 From: Dmitry Kim Date: Fri, 30 Mar 2007 23:44:02 +0000 Subject: *** (httputil) major rework, new caching implementation, unified request processing + (httputil) post support, partial request support, other features - (httputil) removed partial_body() and get_cached() [merged into get()] * (plugins/, utils) minimal changes to accommodate for the new http_utils * (utils, ircbot) moved utils initialization into utils.rb * (tube.rb) (partially) accommodate for upstream site layout changes --- lib/rbot/core/utils/httputil.rb | 611 ++++++++++++++++++---------------------- lib/rbot/core/utils/utils.rb | 5 +- lib/rbot/ircbot.rb | 5 +- 3 files changed, 285 insertions(+), 336 deletions(-) (limited to 'lib/rbot') diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb index aebd1e81..4ce8dcc3 100644 --- a/lib/rbot/core/utils/httputil.rb +++ b/lib/rbot/core/utils/httputil.rb @@ -5,10 +5,11 @@ # # Author:: Tom Gilbert # Author:: Giuseppe "Oblomov" Bilotta +# Author:: Dmitry "jsn" Kim # # Copyright:: (C) 2002-2005 Tom Gilbert # Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta -# Copyright:: (C) 2006,2007 Giuseppe Bilotta +# Copyright:: (C) 2007 Giuseppe Bilotta, Dmitry Kim require 'resolv' require 'net/http' @@ -19,25 +20,6 @@ rescue LoadError => e error "Secured HTTP connections will fail" end -module ::Net - class HTTPResponse - # Read chunks from the body until we have at least _size_ bytes, yielding - # the partial text at each chunk. Return the partial body. - def partial_body(size=0, &block) - - partial = String.new - - self.read_body { |chunk| - partial << chunk - yield partial if block_given? - break if size and size > 0 and partial.length >= size - } - - return partial - end - end -end - Net::HTTP.version_1_2 module ::Irc @@ -79,16 +61,113 @@ class HttpUtil :default => 8192, :desc => "How many bytes to download from a web page to find some information. 
Set to 0 to let the bot download the whole page.") + class CachedObject + attr_accessor :response, :last_used, :first_used, :count, :expires, :date + + def self.maybe_new(resp) + debug "maybe new #{resp}" + return nil unless Net::HTTPOK === resp || + Net::HTTPMovedPermanently === resp || + Net::HTTPFound === resp || + Net::HTTPPartialContent === resp + + cc = resp['cache-control'] + return nil if cc && (cc =~ /no-cache/i) + + date = Time.now + if d = resp['date'] + date = Time.httpdate(d) + end + + return nil if resp['expires'] && (Time.httpdate(resp['expires']) < date) + + debug "creating cache obj" + + self.new(resp) + end + + def use + now = Time.now + @first_used = now if @count == 0 + @last_used = now + @count += 1 + end + + def expired? + debug "checking expired?" + if cc = self.response['cache-control'] && cc =~ /must-revalidate/ + return true + end + return self.expires < Time.now + end + + def setup_headers(hdr) + hdr['if-modified-since'] = self.date.rfc2822 + + debug "ims == #{hdr['if-modified-since']}" + + if etag = self.response['etag'] + hdr['if-none-match'] = etag + debug "etag: #{etag}" + end + end + + def revalidate(resp = self.response) + @count = 0 + self.use + self.date = resp.key?('date') ? 
Time.httpdate(resp['date']) : Time.now + + cc = resp['cache-control'] + if cc && (cc =~ /max-age=(\d+)/) + self.expires = self.date + $1.to_i + elsif resp.key?('expires') + self.expires = Time.httpdate(resp['expires']) + elsif lm = resp['last-modified'] + delta = self.date - Time.httpdate(lm) + delta = 10 if delta <= 0 + delta /= 5 + self.expires = self.date + delta + else + self.expires = self.date + 300 + end + # self.expires = Time.now + 10 # DEBUG + debug "expires on #{self.expires}" + + return true + end + + private + def initialize(resp) + @response = resp + begin + self.revalidate + self.response.body + rescue Exception => e + error e.message + error e.backtrace.join("\n") + raise e + end + end + end + def initialize(bot) @bot = bot @cache = Hash.new @headers = { - 'User-Agent' => "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)", + 'Accept-Charset' => 'utf-8;q=1.0, *;q=0.8', + 'User-Agent' => + "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)" + } + debug "starting http cache cleanup timer" + @timer = @bot.timer.add(300) { + self.remove_stale_cache unless @bot.config['http.no_expire_cache'] } - @last_response = nil + end + + def cleanup + debug 'stopping http cache cleanup timer' + @bot.timer.remove(@timer) end - attr_reader :last_response - attr_reader :headers # if http_proxy_include or http_proxy_exclude are set, then examine the # uri to see if this is a proxied uri @@ -139,7 +218,13 @@ class HttpUtil # proxying based on the bot's proxy configuration. # This will include per-url proxy configuration based on the bot config # +http_proxy_include/exclude+ options. 
- def get_proxy(uri) + + def get_proxy(uri, options = {}) + opts = { + :read_timeout => 10, + :open_timeout => 5 + }.merge(options) + proxy = nil proxy_host = nil proxy_port = nil @@ -166,363 +251,227 @@ class HttpUtil h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_port) h.use_ssl = true if uri.scheme == "https" + + h.read_timeout = opts[:read_timeout] + h.open_timeout = opts[:open_timeout] return h end - # uri:: uri to query (Uri object) - # readtimeout:: timeout for reading the response - # opentimeout:: timeout for opening the connection - # - # simple get request, returns (if possible) response body following redirs - # and caching if requested - # if a block is given, it yields the urls it gets redirected to - # TODO we really need something to implement proper caching - def get(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"], cache=false) - if uri_or_str.kind_of?(URI) - uri = uri_or_str - else - uri = URI.parse(uri_or_str.to_s) - end - debug "Getting #{uri}" - - proxy = get_proxy(uri) - proxy.open_timeout = opentimeout - proxy.read_timeout = readtimeout - - begin - proxy.start() {|http| - yield uri.request_uri() if block_given? - req = Net::HTTP::Get.new(uri.request_uri(), @headers) - if uri.user and uri.password - req.basic_auth(uri.user, uri.password) - end - resp = http.request(req) - case resp - when Net::HTTPSuccess - if cache - debug "Caching #{uri.to_s}" - cache_response(uri.to_s, resp) - end - return resp.body - when Net::HTTPRedirection - if resp.key?('location') - new_loc = URI.join(uri, resp['location']) - debug "Redirecting #{uri} to #{new_loc}" - yield new_loc if block_given? - if max_redir > 0 - # If cache is an Array, we assume get was called by get_cached - # because of a cache miss and that the first value of the Array - # was the noexpire value. 
Since the cache miss might have been - # caused by a redirection, we want to try get_cached again - # TODO FIXME look at Python's httplib2 for a most likely - # better way to handle all this mess - if cache.kind_of?(Array) - return get_cached(new_loc, readtimeout, opentimeout, max_redir-1, cache[0]) - else - return get(new_loc, readtimeout, opentimeout, max_redir-1, cache) - end - else - warning "Max redirection reached, not going to #{new_loc}" - end - else - warning "Unknown HTTP redirection #{resp.inspect}" - end - else - debug "HttpUtil.get return code #{resp.code} #{resp.body}" + def handle_response(uri, resp, opts, &block) + if Net::HTTPRedirection === resp && opts[:max_redir] >= 0 + if resp.key?('location') + raise 'Too many redirections' if opts[:max_redir] <= 0 + yield resp if opts[:yield] == :all && block_given? + loc = resp['location'] + new_loc = URI.join(uri.to_s, loc) rescue URI.parse(loc) + new_opts = opts.dup + new_opts[:max_redir] -= 1 + case opts[:method].to_s.downcase.intern + when :post, :"net::http::post" + new_opts[:method] = :get end - @last_response = resp - return nil - } - rescue StandardError, Timeout::Error => e - error "HttpUtil.get exception: #{e.inspect}, while trying to get #{uri}" - debug e.backtrace.join("\n") + debug "following the redirect to #{new_loc}" + return get_response(new_loc, new_opts, &block) + else + warning ":| redirect w/o location?" + end end - @last_response = nil - return nil - end - - # just like the above, but only gets the head - def head(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"]) - if uri_or_str.kind_of?(URI) - uri = uri_or_str + if block_given? + yield(resp) else - uri = URI.parse(uri_or_str.to_s) + resp.body end - proxy = get_proxy(uri) - proxy.open_timeout = opentimeout - proxy.read_timeout = readtimeout - - begin - proxy.start() {|http| - yield uri.request_uri() if block_given? 
- req = Net::HTTP::Head.new(uri.request_uri(), @headers) - if uri.user and uri.password - req.basic_auth(uri.user, uri.password) - end - resp = http.request(req) - case resp - when Net::HTTPSuccess - return resp - when Net::HTTPRedirection - debug "Redirecting #{uri} to #{resp['location']}" - yield resp['location'] if block_given? - if max_redir > 0 - return head( URI.parse(resp['location']), readtimeout, opentimeout, max_redir-1) - else - warning "Max redirection reached, not going to #{resp['location']}" - end - else - debug "HttpUtil.head return code #{resp.code}" - end - @last_response = resp - return nil - } - rescue StandardError, Timeout::Error => e - error "HttpUtil.head exception: #{e.inspect}, while trying to get #{uri}" - debug e.backtrace.join("\n") - end - @last_response = nil - return nil + return resp end # uri:: uri to query (Uri object or String) # opts:: options. Currently used: + # :method:: request method [:get (default), :post or :head] # :open_timeout:: open timeout for the proxy # :read_timeout:: read timeout for the proxy # :cache:: should we cache results? + # :yield:: if :final [default], call &block for the response object + # if :all, call &block for all intermediate redirects, too + # :max_redir:: how many redirects to follow before raising the exception + # if -1, don't follow redirects, just return them + # :range:: make a ranged request (usually GET). accepts a string + # for HTTP/1.1 "Range:" header (i.e. "bytes=0-1000") + # :body:: request body (usually for POST requests) # - # This method is used to get responses following redirections. + # Generic http transaction method # - # It will return either a Net::HTTPResponse or an error. 
+ # It will return a HTTP::Response object or raise an exception # - # If a block is given, it will yield the response or error instead of - # returning it - # - def get_response(uri_or_str, opts={}, &block) - if uri_or_str.kind_of?(URI) - uri = uri_or_str - else - uri = URI.parse(uri_or_str.to_s) + # If a block is given, it will yield the response (see :yield option) + + def get_response(uri_or_s, options = {}, &block) + uri = uri_or_s.kind_of?(URI) ? uri_or_s : URI.parse(uri_or_s.to_s) + opts = { + :max_redir => @bot.config['http.max_redir'], + :yield => :final, + :cache => true, + :method => :GET + }.merge(options) + + resp = nil + cached = nil + + req_class = case opts[:method].to_s.downcase.intern + when :head, :"net::http::head" + opts[:max_redir] = -1 + Net::HTTP::Head + when :get, :"net::http::get" + Net::HTTP::Get + when :post, :"net::http::post" + opts[:cache] = false + opts[:body] or raise 'post request w/o a body?' + warning "refusing to cache POST request" if options[:cache] + Net::HTTP::Post + else + warning "unsupported method #{opts[:method]}, doing GET" + Net::HTTP::Get + end + + if req_class != Net::HTTP::Get && opts[:range] + warning "can't request ranges for #{req_class}" + opts.delete(:range) end - debug "Getting #{uri}" - options = { - :read_timeout => 10, - :open_timeout => 5, - :max_redir => @bot.config["http.max_redir"], - :cache => false, - :yield => :none - }.merge(opts) - - cache = options[:cache] + cache_key = "#{opts[:range]}|#{req_class}|#{uri.to_s}" - proxy = get_proxy(uri) - proxy.open_timeout = options[:open_timeout] - proxy.read_timeout = options[:read_timeout] - - begin - proxy.start() {|http| - req = Net::HTTP::Get.new(uri.request_uri(), @headers) - if uri.user and uri.password - req.basic_auth(uri.user, uri.password) - end - http.request(req) { |resp| - case resp - when Net::HTTPSuccess - if cache - debug "Caching #{uri.to_s}" - cache_response(uri.to_s, resp) - end - when Net::HTTPRedirection - if resp.key?('location') - 
new_loc = URI.join(uri, resp['location']) rescue URI.parse(resp['location']) - debug "Redirecting #{uri} to #{new_loc}" - if options[:max_redir] > 0 - new_opts = options.dup - new_opts[:max_redir] -= 1 - return get_response(new_loc, new_opts, &block) - else - raise "Too many redirections" - end - end - end - if block_given? - yield resp - else - return resp - end - } - } - rescue StandardError, Timeout::Error => e - error "HttpUtil.get_response exception: #{e.inspect}, while trying to get #{uri}" - debug e.backtrace.join("\n") - def e.body - nil - end - if block_given? - yield e - else - return e + if req_class != Net::HTTP::Get && req_class != Net::HTTP::Head + if opts[:cache] + warning "can't cache #{req_class.inspect} requests, working w/o cache" + opts[:cache] = false end end - raise "This shouldn't happen" - end + debug "get_response(#{uri}, #{opts.inspect})" - def cache_response(k, resp) - begin - if resp.key?('pragma') and resp['pragma'] == 'no-cache' - debug "Not caching #{k}, it has Pragma: no-cache" - return + if opts[:cache] && cached = @cache[cache_key] + debug "got cached" + if !cached.expired? + debug "using cached" + cached.use + return handle_response(uri, cached.response, opts, &block) end - # TODO should we skip caching if neither last-modified nor etag are present? 
- now = Time.new - u = Hash.new - u = Hash.new - u[:body] = resp.body - u[:last_modified] = nil - u[:last_modified] = Time.httpdate(resp['date']) if resp.key?('date') - u[:last_modified] = Time.httpdate(resp['last-modified']) if resp.key?('last-modified') - u[:expires] = now - u[:expires] = Time.httpdate(resp['expires']) if resp.key?('expires') - u[:revalidate] = false - if resp.key?('cache-control') - # TODO max-age - case resp['cache-control'] - when /no-cache|must-revalidate/ - u[:revalidate] = true - end + end + + headers = @headers.dup.merge(opts[:headers] || {}) + headers['Range'] = opts[:range] if opts[:range] + + cached.setup_headers(headers) if cached && (req_class == Net::HTTP::Get) + req = req_class.new(uri.request_uri, headers) + req.basic_auth(uri.user, uri.password) if uri.user && uri.password + req.body = opts[:body] if req_class == Net::HTTP::Post + debug "prepared request: #{req.to_hash.inspect}" + + get_proxy(uri, opts).start do |http| + http.request(req) do |resp| + if Net::HTTPNotModified === resp + debug "not modified" + begin + cached.revalidate(resp) + rescue Exception => e + error e.message + error e.backtrace.join("\n") + end + debug "reusing cached" + resp = cached.response + elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp + debug "http error, deleting cached obj" if cached + @cache.delete(cache_key) + elsif opts[:cache] && cached = CachedObject.maybe_new(resp) rescue nil + debug "storing to cache" + @cache[cache_key] = cached + end + return handle_response(uri, resp, opts, &block) end - u[:etag] = "" - u[:etag] = resp['etag'] if resp.key?('etag') - u[:count] = 1 - u[:first_use] = now - u[:last_use] = now - rescue => e - error "Failed to cache #{k}/#{resp.to_hash.inspect}: #{e.inspect}" - return end - @cache[k] = u - debug "Cached #{k}/#{resp.to_hash.inspect}: #{u.inspect_no_body}" - debug "#{@cache.size} pages (#{@cache.keys.join(', ')}) cached up to now" end - # For debugging purposes - class ::Hash - def 
inspect_no_body - temp = self.dup - temp.delete(:body) - temp.inspect + # uri:: uri to query (Uri object) + # + # simple get request, returns (if possible) response body following redirs + # and caching if requested + def get(uri, opts = {}, &block) + begin + resp = get_response(uri, opts, &block) + raise "http error: #{resp}" unless Net::HTTPOK === resp || + Net::HTTPPartialContent === resp + return resp.body + rescue Exception => e + error e.message + error e.backtrace.join("\n") end + return nil end - def expired?(uri, readtimeout, opentimeout) - k = uri.to_s - debug "Checking cache validity for #{k}" + def head(uri, options = {}, &block) + opts = {:method => :head}.merge(options) begin - return true unless @cache.key?(k) - u = @cache[k] - - # TODO we always revalidate for the time being - - if u[:etag].empty? and u[:last_modified].nil? - # TODO max-age - return true - end - - proxy = get_proxy(uri) - proxy.open_timeout = opentimeout - proxy.read_timeout = readtimeout - - proxy.start() {|http| - yield uri.request_uri() if block_given? - headers = @headers.dup - headers['If-None-Match'] = u[:etag] unless u[:etag].empty? - headers['If-Modified-Since'] = u[:last_modified].rfc2822 if u[:last_modified] - debug "Cache HEAD request headers: #{headers.inspect}" - # FIXME TODO We might want to use a Get here - # because if a 200 OK is returned we would get the new body - # with one connection less ... 
- req = Net::HTTP::Head.new(uri.request_uri(), headers) - if uri.user and uri.password - req.basic_auth(uri.user, uri.password) - end - resp = http.request(req) - debug "Checking cache validity of #{u.inspect_no_body} against #{resp.inspect}/#{resp.to_hash.inspect}" - case resp - when Net::HTTPNotModified - return false - else - return true - end - } - rescue => e - error "Failed to check cache validity for #{uri}: #{e.inspect}" - return true + resp = get_response(uri, opts, &block) + raise "http error #{resp}" if Net::HTTPClientError === resp || + Net::HTTPServerError == resp + return resp + rescue Exception => e + error e.message + error e.backtrace.join("\n") end + return nil end - # gets a page from the cache if it's still (assumed to be) valid - # TODO remove stale cached pages, except when called with noexpire=true - def get_cached(uri_or_str, readtimeout=10, opentimeout=5, - max_redir=@bot.config['http.max_redir'], - noexpire=@bot.config['http.no_expire_cache']) - if uri_or_str.kind_of?(URI) - uri = uri_or_str - else - uri = URI.parse(uri_or_str.to_s) - end - debug "Getting cached #{uri}" - - if expired?(uri, readtimeout, opentimeout) - debug "Cache expired" - bod = get(uri, readtimeout, opentimeout, max_redir, [noexpire]) - bod.instance_variable_set(:@cached,false) - else - k = uri.to_s - debug "Using cache" - @cache[k][:count] += 1 - @cache[k][:last_use] = Time.now - bod = @cache[k][:body] - bod.instance_variable_set(:@cached,true) - end - unless noexpire - remove_stale_cache - end - unless bod.respond_to?(:cached?) - def bod.cached? 
- return @cached - end + def post(uri, data, options = {}, &block) + opts = {:method => :post, :body => data, :cache => false}.merge(options) + begin + resp = get_response(uri, opts, &block) + raise 'http error' unless Net::HTTPOK === resp + return resp + rescue Exception => e + error e.message + error e.backtrace.join("\n") end - return bod + return nil end - # We consider a page to be manually expired if it has no - # etag and no last-modified and if any of the expiration - # conditions are met (expire_time, max_cache_time, Expires) - def manually_expired?(hash, time) - auto = hash[:etag].empty? and hash[:last_modified].nil? - # TODO max-age - manual = (time - hash[:last_use] > @bot.config['http.expire_time']*60) or - (time - hash[:first_use] > @bot.config['http.max_cache_time']*60) or - (hash[:expires] < time) - return (auto and manual) + def get_partial(uri, nbytes = @bot.config['http.info_bytes'], options = {}, &block) + opts = {:range => "bytes=0-#{nbytes}"}.merge(options) + return get(uri, opts, &block) end def remove_stale_cache debug "Removing stale cache" + now = Time.new + max_last = @bot.config['http.expire_time'] * 60 + max_first = @bot.config['http.max_cache_time'] * 60 debug "#{@cache.size} pages before" begin - now = Time.new - @cache.reject! { |k, val| - manually_expired?(val, now) - } + @cache.reject! 
{ |k, val| + (now - val.last_used > max_last) || (now - val.first_used > max_first) + } rescue => e error "Failed to remove stale cache: #{e.inspect}" end debug "#{@cache.size} pages after" end + end end end + +class HttpUtilPlugin < CoreBotModule + def initialize(*a) + super(*a) + debug 'initializing httputil' + @bot.httputil = Irc::Utils::HttpUtil.new(@bot) + end + + def cleanup + debug 'shutting down httputil' + @bot.httputil.cleanup + @bot.httputil = nil + end +end + +HttpUtilPlugin.new diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 251e7986..717630e3 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -318,6 +318,7 @@ module ::Irc end def Utils.bot=(b) + debug "initializing utils" @@bot = b @@safe_save_dir = "#{@@bot.botclass}/safe_save" end @@ -523,7 +524,7 @@ module ::Irc # FIXME what happens if some big file is returned? We should share # code with the url plugin to only retrieve partial file content! - xml = self.bot.httputil.get_cached(url) + xml = self.bot.httputil.get(url) if xml.nil? debug "Unable to retrieve #{url}" next @@ -549,3 +550,5 @@ module ::Irc end end + +Irc::Utils.bot = Irc::Plugins.manager.bot diff --git a/lib/rbot/ircbot.rb b/lib/rbot/ircbot.rb index 93d65200..54782f70 100644 --- a/lib/rbot/ircbot.rb +++ b/lib/rbot/ircbot.rb @@ -124,7 +124,7 @@ class Bot # bot's httputil help object, for fetching resources via http. Sets up # proxies etc as defined by the bot configuration/environment - attr_reader :httputil + attr_accessor :httputil # server we are connected to # TODO multiserver @@ -452,9 +452,6 @@ class Bot @plugins.scan - Utils.bot = self - @httputil = Utils::HttpUtil.new(self) - # Channels where we are quiet # Array of channels names where the bot should be quiet # '*' means all channels -- cgit v1.2.3