4 # :title: rbot HTTP provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2005 Tom Gilbert
10 # Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta
11 # Copyright:: (C) 2006,2007 Giuseppe Bilotta
18 error "Couldn't load 'net/https': #{e.inspect}"
19 error "Secured HTTP connections will fail"
24 # Read chunks from the body until we have at least _size_ bytes, yielding
25 # the partial text at each chunk. Return the partial body.
# NOTE(review): only a fragment of this method is visible in this view; the
# accumulation of +partial+ and the final return happen on lines not shown.
26 def partial_body(size, &block)
30 self.read_body { |chunk|
# Stop reading once enough data has been accumulated; a nil/false _size_
# skips the guard and reads the whole body.
33 break if size and partial.length >= size
46 # class for making http requests easier (mainly for plugins to use)
47 # this class can check the bot proxy configuration to determine if a proxy
48 # needs to be used, which includes support for per-url proxy configuration.
# Global on/off switch for proxying; the proxy itself comes from
# http.proxy_uri or the http_proxy environment variable (see get_proxy).
50 BotConfig.register BotConfigBooleanValue.new('http.use_proxy',
51 :default => false, :desc => "should a proxy be used for HTTP requests?")
# NOTE(review): :default => false on a StringValue presumably means "unset";
# confirm BotConfigStringValue tolerates a non-String default.
52 BotConfig.register BotConfigStringValue.new('http.proxy_uri', :default => false,
53 :desc => "Proxy server to use for HTTP requests (URI, e.g http://proxy.host:port)")
54 BotConfig.register BotConfigStringValue.new('http.proxy_user',
56 :desc => "User for authenticating with the http proxy (if required)")
57 BotConfig.register BotConfigStringValue.new('http.proxy_pass',
59 :desc => "Password for authenticating with the http proxy (if required)")
60 BotConfig.register BotConfigArrayValue.new('http.proxy_include',
62 :desc => "List of regexps to check against a URI's hostname/ip to see if we should use the proxy to access this URI. All URIs are proxied by default if the proxy is set, so this is only required to re-include URIs that might have been excluded by the exclude list. e.g. exclude /.*\.foo\.com/, include bar\.foo\.com")
# NOTE(review): the desc below reads "we should use avoid the proxy" --
# wording typo in a runtime string, left untouched here.
63 BotConfig.register BotConfigArrayValue.new('http.proxy_exclude',
65 :desc => "List of regexps to check against a URI's hostname/ip to see if we should use avoid the proxy to access this URI and access it directly")
66 BotConfig.register BotConfigIntegerValue.new('http.max_redir',
68 :desc => "Maximum number of redirections to be used when getting a document")
69 BotConfig.register BotConfigIntegerValue.new('http.expire_time',
71 :desc => "After how many minutes since last use a cached document is considered to be expired")
72 BotConfig.register BotConfigIntegerValue.new('http.max_cache_time',
74 :desc => "After how many minutes since first use a cached document is considered to be expired")
# NOTE(review): registered as an IntegerValue but described as a true/false
# switch -- looks like it should be a BotConfigBooleanValue; verify usage
# before changing the registered type.
75 BotConfig.register BotConfigIntegerValue.new('http.no_expire_cache',
77 :desc => "Set this to true if you want the bot to never expire the cached pages")
# Default headers sent with every request (the enclosing hash literal is
# not visible in this view).
83 'User-Agent' => "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)",
# Last Net::HTTPResponse seen, updated by get/head/get_response.
87 attr_reader :last_response
90 # if http_proxy_include or http_proxy_exclude are set, then examine the
91 # uri to see if this is a proxied uri
92 # the in/excludes are a list of regexps, and each regexp is checked against
93 # the server name, and its IP addresses
94 def proxy_required(uri)
96 if @bot.config["http.proxy_exclude"].empty? && @bot.config["http.proxy_include"].empty?
102 list.concat Resolv.getaddresses(uri.host)
103 rescue StandardError => err
104 warning "couldn't resolve host uri.host"
107 unless @bot.config["http.proxy_exclude"].empty?
108 re = @bot.config["http.proxy_exclude"].collect{|r| Regexp.new(r)}
118 unless @bot.config["http.proxy_include"].empty?
119 re = @bot.config["http.proxy_include"].collect{|r| Regexp.new(r)}
129 debug "using proxy for uri #{uri}?: #{use_proxy}"
133 # uri:: Uri to create a proxy for
135 # return a net/http Proxy object, which is configured correctly for
136 # proxying based on the bot's proxy configuration.
137 # This will include per-url proxy configuration based on the bot config
138 # +http_proxy_include/exclude+ options.
146 if @bot.config["http.use_proxy"]
147 if (ENV['http_proxy'])
148 proxy = URI.parse ENV['http_proxy'] rescue nil
150 if (@bot.config["http.proxy_uri"])
151 proxy = URI.parse @bot.config["http.proxy_uri"] rescue nil
154 debug "proxy is set to #{proxy.host} port #{proxy.port}"
155 if proxy_required(uri)
156 proxy_host = proxy.host
157 proxy_port = proxy.port
158 proxy_user = @bot.config["http.proxy_user"]
159 proxy_pass = @bot.config["http.proxy_pass"]
164 h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_port)
165 h.use_ssl = true if uri.scheme == "https"
169 # uri:: uri to query (Uri object)
170 # readtimeout:: timeout for reading the response
171 # opentimeout:: timeout for opening the connection
173 # simple get request, returns (if possible) response body following redirs
174 # and caching if requested
175 # if a block is given, it yields the urls it gets redirected to
176 # TODO we really need something to implement proper caching
# NOTE(review): fragment -- the else-branches, case dispatch line and the
# method's returns are on lines not visible in this view.
177 def get(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"], cache=false)
# Accept either a URI or anything convertible to one via to_s.
178 if uri_or_str.kind_of?(URI)
181 uri = URI.parse(uri_or_str.to_s)
183 debug "Getting #{uri}"
# get_proxy returns a Net::HTTP (possibly proxy-configured) for this uri.
185 proxy = get_proxy(uri)
186 proxy.open_timeout = opentimeout
187 proxy.read_timeout = readtimeout
190 proxy.start() {|http|
191 yield uri.request_uri() if block_given?
192 req = Net::HTTP::Get.new(uri.request_uri(), @headers)
193 if uri.user and uri.password
194 req.basic_auth(uri.user, uri.password)
196 resp = http.request(req)
198 when Net::HTTPSuccess
200 debug "Caching #{uri.to_s}"
201 cache_response(uri.to_s, resp)
# Follow redirects manually so max_redir and the yield hook apply.
204 when Net::HTTPRedirection
205 if resp.key?('location')
# URI.join handles relative Location headers against the request uri.
206 new_loc = URI.join(uri, resp['location'])
207 debug "Redirecting #{uri} to #{new_loc}"
208 yield new_loc if block_given?
210 # If cache is an Array, we assume get was called by get_cached
211 # because of a cache miss and that the first value of the Array
212 # was the noexpire value. Since the cache miss might have been
213 # caused by a redirection, we want to try get_cached again
214 # TODO FIXME look at Python's httplib2 for a most likely
215 # better way to handle all this mess
216 if cache.kind_of?(Array)
217 return get_cached(new_loc, readtimeout, opentimeout, max_redir-1, cache[0])
219 return get(new_loc, readtimeout, opentimeout, max_redir-1, cache)
222 warning "Max redirection reached, not going to #{new_loc}"
225 warning "Unknown HTTP redirection #{resp.inspect}"
228 debug "HttpUtil.get return code #{resp.code} #{resp.body}"
# Record the response for callers that inspect #last_response afterwards.
230 @last_response = resp
233 rescue StandardError, Timeout::Error => e
234 error "HttpUtil.get exception: #{e.inspect}, while trying to get #{uri}"
235 debug e.backtrace.join("\n")
241 # just like the above, but only gets the head
# Performs a HEAD request for uri_or_str, following up to max_redir
# redirections; yields each request/redirect target when a block is given.
# NOTE(review): fragment -- branch bodies and returns are on lines not
# visible in this view.
242 def head(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"])
243 if uri_or_str.kind_of?(URI)
246 uri = URI.parse(uri_or_str.to_s)
249 proxy = get_proxy(uri)
250 proxy.open_timeout = opentimeout
251 proxy.read_timeout = readtimeout
254 proxy.start() {|http|
255 yield uri.request_uri() if block_given?
256 req = Net::HTTP::Head.new(uri.request_uri(), @headers)
257 if uri.user and uri.password
258 req.basic_auth(uri.user, uri.password)
260 resp = http.request(req)
262 when Net::HTTPSuccess
264 when Net::HTTPRedirection
265 debug "Redirecting #{uri} to #{resp['location']}"
266 yield resp['location'] if block_given?
# FIX: resolve the Location header against the request uri, as #get does;
# URI.parse alone raises/misbehaves on relative redirect targets.
268 return head( URI.join(uri, resp['location']), readtimeout, opentimeout, max_redir-1)
270 warning "Max redirection reached, not going to #{resp['location']}"
273 debug "HttpUtil.head return code #{resp.code}"
275 @last_response = resp
278 rescue StandardError, Timeout::Error => e
279 error "HttpUtil.head exception: #{e.inspect}, while trying to get #{uri}"
280 debug e.backtrace.join("\n")
286 # uri:: uri to query (Uri object or String)
287 # opts:: options. Currently used:
288 # :open_timeout:: open timeout for the proxy
289 # :read_timeout:: read timeout for the proxy
290 # :cache:: should we cache results?
292 # This method is used to get responses following redirections.
294 # It will return either a Net::HTTPResponse or an error.
296 # If a block is given, it will yield the response or error instead of
# NOTE(review): fragment -- the options-merge, case dispatch and yield lines
# are not visible in this view; comments describe only what is shown.
299 def get_response(uri_or_str, opts={}, &block)
300 if uri_or_str.kind_of?(URI)
303 uri = URI.parse(uri_or_str.to_s)
305 debug "Getting #{uri}"
# Default for :max_redir comes from the bot config (part of an invisible
# defaults hash merged with opts).
310 :max_redir => @bot.config["http.max_redir"],
315 cache = options[:cache]
317 proxy = get_proxy(uri)
318 proxy.open_timeout = options[:open_timeout]
319 proxy.read_timeout = options[:read_timeout]
322 proxy.start() {|http|
323 req = Net::HTTP::Get.new(uri.request_uri(), @headers)
324 if uri.user and uri.password
325 req.basic_auth(uri.user, uri.password)
# Streaming form of request: resp is yielded while the connection is open.
327 http.request(req) { |resp|
329 when Net::HTTPSuccess
331 debug "Caching #{uri.to_s}"
332 cache_response(uri.to_s, resp)
334 when Net::HTTPRedirection
335 if resp.key?('location')
# Fall back to a plain parse if joining against uri fails.
336 new_loc = URI.join(uri, resp['location']) rescue URI.parse(resp['location'])
337 debug "Redirecting #{uri} to #{new_loc}"
338 if options[:max_redir] > 0
# Recurse with a decremented redirect budget; options is duplicated so
# the caller's hash is not mutated.
339 new_opts = options.dup
340 new_opts[:max_redir] -= 1
341 return get_response(new_loc, new_opts, &block)
343 raise "Too many redirections"
354 rescue StandardError, Timeout::Error => e
355 error "HttpUtil.get_response exception: #{e.inspect}, while trying to get #{uri}"
356 debug e.backtrace.join("\n")
# Unreachable guard: every path above should have returned or raised.
367 raise "This shouldn't happen"
# Stores response +resp+ in the page cache under key +k+, recording the
# validators (ETag, Last-Modified) and expiry metadata used by #expired?.
# NOTE(review): fragment -- cache-entry creation and the begin/rescue
# structure are on lines not visible in this view.
370 def cache_response(k, resp)
# Honour an explicit Pragma: no-cache from the server.
372 if resp.key?('pragma') and resp['pragma'] == 'no-cache'
373 debug "Not caching #{k}, it has Pragma: no-cache"
376 # TODO should we skip caching if neither last-modified nor etag are present?
# Last-Modified falls back to the Date header; a real Last-Modified
# (checked second) overrides it.
381 u[:last_modified] = nil
382 u[:last_modified] = Time.httpdate(resp['date']) if resp.key?('date')
383 u[:last_modified] = Time.httpdate(resp['last-modified']) if resp.key?('last-modified')
# :expires is only set when the server sent an Expires header.
385 u[:expires] = Time.httpdate(resp['expires']) if resp.key?('expires')
386 u[:revalidate] = false
387 if resp.key?('cache-control')
389 case resp['cache-control']
390 when /no-cache|must-revalidate/
391 u[:revalidate] = true
395 u[:etag] = resp['etag'] if resp.key?('etag')
400 error "Failed to cache #{k}/#{resp.to_hash.inspect}: #{e.inspect}"
404 debug "Cached #{k}/#{resp.to_hash.inspect}: #{u.inspect_no_body}"
405 debug "#{@cache.size} pages (#{@cache.keys.join(', ')}) cached up to now"
408 # For debugging purposes
# Returns whether the cached copy of +uri+ should be considered stale,
# revalidating against the server with a conditional HEAD request.
# NOTE(review): fragment -- cache-key computation, branch bodies and the
# method's returns are on lines not visible in this view.
417 def expired?(uri, readtimeout, opentimeout)
419 debug "Checking cache validity for #{k}"
# Nothing cached means trivially expired.
421 return true unless @cache.key?(k)
424 # TODO we always revalidate for the time being
# Without either validator there is nothing to revalidate against.
426 if u[:etag].empty? and u[:last_modified].nil?
431 proxy = get_proxy(uri)
432 proxy.open_timeout = opentimeout
433 proxy.read_timeout = readtimeout
435 proxy.start() {|http|
436 yield uri.request_uri() if block_given?
# Build conditional-request headers from the cached validators.
437 headers = @headers.dup
438 headers['If-None-Match'] = u[:etag] unless u[:etag].empty?
439 headers['If-Modified-Since'] = u[:last_modified].rfc2822 if u[:last_modified]
440 debug "Cache HEAD request headers: #{headers.inspect}"
441 # FIXME TODO We might want to use a Get here
442 # because if a 200 OK is returned we would get the new body
443 # with one connection less ...
444 req = Net::HTTP::Head.new(uri.request_uri(), headers)
445 if uri.user and uri.password
446 req.basic_auth(uri.user, uri.password)
448 resp = http.request(req)
449 debug "Checking cache validity of #{u.inspect_no_body} against #{resp.inspect}/#{resp.to_hash.inspect}"
# 304 Not Modified means the cached copy is still good.
451 when Net::HTTPNotModified
458 error "Failed to check cache validity for #{uri}: #{e.inspect}"
463 # gets a page from the cache if it's still (assumed to be) valid
464 # TODO remove stale cached pages, except when called with noexpire=true
# NOTE(review): fragment -- cache-key computation, the else branch joining
# the two paths, and the return are on lines not visible in this view.
465 def get_cached(uri_or_str, readtimeout=10, opentimeout=5,
466 max_redir=@bot.config['http.max_redir'],
467 noexpire=@bot.config['http.no_expire_cache'])
468 if uri_or_str.kind_of?(URI)
471 uri = URI.parse(uri_or_str.to_s)
473 debug "Getting cached #{uri}"
475 if expired?(uri, readtimeout, opentimeout)
476 debug "Cache expired"
# Cache miss: re-fetch. Wrapping noexpire in an Array signals #get that
# this call originated from get_cached (see the comment in #get).
477 bod = get(uri, readtimeout, opentimeout, max_redir, [noexpire])
478 bod.instance_variable_set(:@cached,false)
# Cache hit: bump usage stats and serve the cached body.
482 @cache[k][:count] += 1
483 @cache[k][:last_use] = Time.now
484 bod = @cache[k][:body]
485 bod.instance_variable_set(:@cached,true)
# Define a cached? accessor on the body object if it lacks one.
490 unless bod.respond_to?(:cached?)
498 # We consider a page to be manually expired if it has no
499 # etag and no last-modified and if any of the expiration
500 # conditions are met (expire_time, max_cache_time, Expires)
501 def manually_expired?(hash, time)
502 auto = hash[:etag].empty? and hash[:last_modified].nil?
504 manual = (time - hash[:last_use] > @bot.config['http.expire_time']*60) or
505 (time - hash[:first_use] > @bot.config['http.max_cache_time']*60) or
506 (hash[:expires] < time)
507 return (auto and manual)
# Drops cache entries that manually_expired? deems stale.
# NOTE(review): fragment -- the begin/rescue structure and the assignment
# of `now` (presumably Time.now) are on lines not visible in this view.
510 def remove_stale_cache
511 debug "Removing stale cache"
512 debug "#{@cache.size} pages before"
# reject! mutates @cache in place, removing entries the block approves.
515 @cache.reject! { |k, val|
516 manually_expired?(val, now)
519 error "Failed to remove stale cache: #{e.inspect}"
521 debug "#{@cache.size} pages after"