4 # :title: rbot HTTP provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2005 Tom Gilbert
10 # Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta
11 # Copyright:: (C) 2006,2007 Giuseppe Bilotta
18 error "Couldn't load 'net/https': #{e.inspect}"
19 error "Secured HTTP connections will fail"
24 # Read chunks from the body until we have at least _size_ bytes, yielding
25 # the partial text at each chunk. Return the partial body.
# size:: minimum number of bytes to accumulate before stopping; 0 (the
#        default) or nil means "read the whole body"
# block:: optional; called with the partial body accumulated so far after
#         every chunk
26 def partial_body(size=0, &block)
# NOTE(review): the `partial` accumulator is initialized on lines not shown
# in this excerpt; the statements below rely on it — confirm against the
# full file.
30 self.read_body { |chunk|
32 yield partial if block_given?
# Stop reading early once enough data has been gathered; a nil or zero
# size disables the early exit so the whole body is read.
33 break if size and size > 0 and partial.length >= size
46 # class for making http requests easier (mainly for plugins to use)
47 # this class can check the bot proxy configuration to determine if a proxy
48 # needs to be used, which includes support for per-url proxy configuration.
# HTTP/proxy tunables registered with the bot configuration system.
# NOTE(review): several `:default =>` lines fall outside this excerpt.
50 BotConfig.register BotConfigBooleanValue.new('http.use_proxy',
51 :default => false, :desc => "should a proxy be used for HTTP requests?")
# NOTE(review): a false default on a String value is unusual; get_proxy only
# tests it for truthiness, so it behaves like "unset" — confirm intent.
52 BotConfig.register BotConfigStringValue.new('http.proxy_uri', :default => false,
53 :desc => "Proxy server to use for HTTP requests (URI, e.g http://proxy.host:port)")
54 BotConfig.register BotConfigStringValue.new('http.proxy_user',
56 :desc => "User for authenticating with the http proxy (if required)")
57 BotConfig.register BotConfigStringValue.new('http.proxy_pass',
59 :desc => "Password for authenticating with the http proxy (if required)")
# Regexp source strings matched by proxy_required against the host name and
# every address it resolves to.
60 BotConfig.register BotConfigArrayValue.new('http.proxy_include',
62 :desc => "List of regexps to check against a URI's hostname/ip to see if we should use the proxy to access this URI. All URIs are proxied by default if the proxy is set, so this is only required to re-include URIs that might have been excluded by the exclude list. e.g. exclude /.*\.foo\.com/, include bar\.foo\.com")
63 BotConfig.register BotConfigArrayValue.new('http.proxy_exclude',
# NOTE(review): "use avoid" in the description below is a typo, but it is a
# runtime (user-visible) string, so it is left untouched in this doc pass.
65 :desc => "List of regexps to check against a URI's hostname/ip to see if we should use avoid the proxy to access this URI and access it directly")
66 BotConfig.register BotConfigIntegerValue.new('http.max_redir',
68 :desc => "Maximum number of redirections to be used when getting a document")
# Cache-expiry knobs, both expressed in minutes (consulted by expired? and
# manually_expired?).
69 BotConfig.register BotConfigIntegerValue.new('http.expire_time',
71 :desc => "After how many minutes since last use a cached document is considered to be expired")
72 BotConfig.register BotConfigIntegerValue.new('http.max_cache_time',
74 :desc => "After how many minutes since first use a cached document is considered to be expired")
# Whether cached pages should ever be expired; this is read as a boolean
# (it supplies the default for get_cached's +noexpire+ parameter).
# FIX: register it as a Boolean value — it was a BotConfigIntegerValue even
# though the description ("Set this to true ...") and every use treat it as
# true/false.
BotConfig.register BotConfigBooleanValue.new('http.no_expire_cache',
  :default => false,
  :desc => "Set this to true if you want the bot to never expire the cached pages")
# Byte budget for "peek" downloads used when only page metadata is needed;
# 0 disables the limit and fetches whole pages.
78 BotConfig.register BotConfigIntegerValue.new('http.info_bytes',
80 :desc => "How many bytes to download from a web page to find some information. Set to 0 to let the bot download the whole page.")
86 'User-Agent' => "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)",
90 attr_reader :last_response
# Decide whether requests for +uri+ should go through the configured proxy.
#
# All URIs are proxied by default; the http.proxy_exclude and
# http.proxy_include bot config options hold lists of regexp source
# strings that are matched against the URI's host name and every IP
# address it resolves to. Excludes are applied first, then includes, so an
# include can re-enable the proxy for a host matched by a broader exclude.
#
# uri:: the URI being requested (anything responding to #host)
#
# Returns true when the proxy should be used, false otherwise.
def proxy_required(uri)
  use_proxy = true
  # Fast path: with no include/exclude lists there is nothing to match.
  if @bot.config["http.proxy_exclude"].empty? && @bot.config["http.proxy_include"].empty?
    return use_proxy
  end

  # Match against the host name itself and, when resolvable, all of its
  # addresses. Resolution failures are logged but non-fatal.
  list = [uri.host]
  begin
    list.concat Resolv.getaddresses(uri.host)
  rescue StandardError => err
    # FIX: the original message was "couldn't resolve host uri.host" — the
    # host name was never interpolated.
    warning "couldn't resolve host #{uri.host}: #{err.inspect}"
  end

  unless @bot.config["http.proxy_exclude"].empty?
    re = @bot.config["http.proxy_exclude"].collect { |r| Regexp.new(r) }
    re.each { |r|
      list.each { |item|
        if r.match(item)
          use_proxy = false
          break
        end
      }
    }
  end
  unless @bot.config["http.proxy_include"].empty?
    re = @bot.config["http.proxy_include"].collect { |r| Regexp.new(r) }
    re.each { |r|
      list.each { |item|
        if r.match(item)
          use_proxy = true
          break
        end
      }
    }
  end
  debug "using proxy for uri #{uri}?: #{use_proxy}"
  return use_proxy
end
# uri:: Uri to create a proxy for
#
# Returns a Net::HTTP object configured according to the bot's proxy
# settings (http.use_proxy, http.proxy_uri and friends), including the
# per-URI http.proxy_include/http.proxy_exclude filtering performed by
# proxy_required. The http_proxy environment variable is honoured too,
# but an explicit http.proxy_uri setting takes precedence over it.
def get_proxy(uri)
  proxy = nil
  proxy_host = nil
  proxy_port = nil
  proxy_user = nil
  proxy_pass = nil

  if @bot.config["http.use_proxy"]
    if (ENV['http_proxy'])
      proxy = URI.parse ENV['http_proxy'] rescue nil
    end
    if (@bot.config["http.proxy_uri"])
      proxy = URI.parse @bot.config["http.proxy_uri"] rescue nil
    end
    if proxy
      debug "proxy is set to #{proxy.host} port #{proxy.port}"
      if proxy_required(uri)
        proxy_host = proxy.host
        proxy_port = proxy.port
        proxy_user = @bot.config["http.proxy_user"]
        proxy_pass = @bot.config["http.proxy_pass"]
      end
    end
  end

  # FIX: the last argument to Net::HTTP.new is the proxy *password*; it was
  # mistakenly passed as proxy_port, so authenticated proxies never worked.
  h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_pass)
  h.use_ssl = true if uri.scheme == "https"
  return h
end
172 # uri:: uri to query (Uri object)
173 # readtimeout:: timeout for reading the response
174 # opentimeout:: timeout for opening the connection
# max_redir:: maximum number of redirections that will be followed
# cache:: when truthy, successful responses are stored via cache_response;
#         an Array value is a private protocol used by get_cached (see the
#         inline comment further down)
176 # simple get request, returns (if possible) response body following redirs
177 # and caching if requested
178 # if a block is given, it yields the urls it gets redirected to
179 # TODO we really need something to implement proper caching
180 def get(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"], cache=false)
# Accept both URI objects and plain strings.
181 if uri_or_str.kind_of?(URI)
184 uri = URI.parse(uri_or_str.to_s)
186 debug "Getting #{uri}"
# get_proxy returns a (possibly proxied) Net::HTTP for this uri.
188 proxy = get_proxy(uri)
189 proxy.open_timeout = opentimeout
190 proxy.read_timeout = readtimeout
193 proxy.start() {|http|
194 yield uri.request_uri() if block_given?
195 req = Net::HTTP::Get.new(uri.request_uri(), @headers)
# Forward credentials embedded in the URI, if any.
196 if uri.user and uri.password
197 req.basic_auth(uri.user, uri.password)
199 resp = http.request(req)
201 when Net::HTTPSuccess
203 debug "Caching #{uri.to_s}"
204 cache_response(uri.to_s, resp)
207 when Net::HTTPRedirection
208 if resp.key?('location')
# Resolve the Location header against the current uri so that
# relative redirects work too.
209 new_loc = URI.join(uri, resp['location'])
210 debug "Redirecting #{uri} to #{new_loc}"
211 yield new_loc if block_given?
213 # If cache is an Array, we assume get was called by get_cached
214 # because of a cache miss and that the first value of the Array
215 # was the noexpire value. Since the cache miss might have been
216 # caused by a redirection, we want to try get_cached again
217 # TODO FIXME look at Python's httplib2 for a most likely
218 # better way to handle all this mess
219 if cache.kind_of?(Array)
220 return get_cached(new_loc, readtimeout, opentimeout, max_redir-1, cache[0])
222 return get(new_loc, readtimeout, opentimeout, max_redir-1, cache)
225 warning "Max redirection reached, not going to #{new_loc}"
228 warning "Unknown HTTP redirection #{resp.inspect}"
231 debug "HttpUtil.get return code #{resp.code} #{resp.body}"
# Remember the last response for callers that want status/headers.
233 @last_response = resp
# Network and timeout failures are logged; the method then returns nil.
236 rescue StandardError, Timeout::Error => e
237 error "HttpUtil.get exception: #{e.inspect}, while trying to get #{uri}"
238 debug e.backtrace.join("\n")
# Just like get(), but issues a HEAD request and returns the whole
# Net::HTTPResponse (there is no body), following up to +max_redir+
# redirections. Yields the request uri (and each redirect Location) to an
# optional block.
#
# uri_or_str:: uri to query (URI object or String)
# readtimeout:: timeout for reading the response
# opentimeout:: timeout for opening the connection
# max_redir:: maximum number of redirections to follow
#
# Returns the response on success, nil on failure; the last response seen
# is stored in @last_response.
def head(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"])
  if uri_or_str.kind_of?(URI)
    uri = uri_or_str
  else
    uri = URI.parse(uri_or_str.to_s)
  end

  proxy = get_proxy(uri)
  proxy.open_timeout = opentimeout
  proxy.read_timeout = readtimeout

  begin
    proxy.start() {|http|
      yield uri.request_uri() if block_given?
      req = Net::HTTP::Head.new(uri.request_uri(), @headers)
      # Forward credentials embedded in the URI, if any.
      if uri.user and uri.password
        req.basic_auth(uri.user, uri.password)
      end
      resp = http.request(req)
      case resp
      when Net::HTTPSuccess
        return resp
      when Net::HTTPRedirection
        debug "Redirecting #{uri} to #{resp['location']}"
        yield resp['location'] if block_given?
        if max_redir > 0
          # FIX: resolve the Location header against the current uri, as
          # get() does; bare URI.parse breaks on relative redirects.
          return head(URI.join(uri, resp['location']), readtimeout, opentimeout, max_redir-1)
        else
          warning "Max redirection reached, not going to #{resp['location']}"
        end
      else
        debug "HttpUtil.head return code #{resp.code}"
      end
      @last_response = resp
      return nil
    }
  rescue StandardError, Timeout::Error => e
    error "HttpUtil.head exception: #{e.inspect}, while trying to get #{uri}"
    debug e.backtrace.join("\n")
  end
  return nil
end
289 # uri:: uri to query (Uri object or String)
290 # opts:: options. Currently used:
291 # :open_timeout:: open timeout for the proxy
292 # :read_timeout:: read timeout for the proxy
293 # :cache:: should we cache results?
# :max_redir:: maximum number of redirections to follow (defaults to the
#              http.max_redir bot config value)
295 # This method is used to get responses following redirections.
297 # It will return either a Net::HTTPResponse or an error.
299 # If a block is given, it will yield the response or error instead of
302 def get_response(uri_or_str, opts={}, &block)
# Accept both URI objects and plain strings.
303 if uri_or_str.kind_of?(URI)
306 uri = URI.parse(uri_or_str.to_s)
308 debug "Getting #{uri}"
# Defaults merged with the caller-supplied opts.
313 :max_redir => @bot.config["http.max_redir"],
318 cache = options[:cache]
320 proxy = get_proxy(uri)
321 proxy.open_timeout = options[:open_timeout]
322 proxy.read_timeout = options[:read_timeout]
325 proxy.start() {|http|
326 req = Net::HTTP::Get.new(uri.request_uri(), @headers)
# Forward credentials embedded in the URI, if any.
327 if uri.user and uri.password
328 req.basic_auth(uri.user, uri.password)
# Streaming form of request: resp is yielded before the body is read, so
# callers can use partial_body on it.
330 http.request(req) { |resp|
332 when Net::HTTPSuccess
334 debug "Caching #{uri.to_s}"
335 cache_response(uri.to_s, resp)
337 when Net::HTTPRedirection
338 if resp.key?('location')
# Resolve relative Location headers; fall back to parsing the raw value
# when joining fails.
339 new_loc = URI.join(uri, resp['location']) rescue URI.parse(resp['location'])
340 debug "Redirecting #{uri} to #{new_loc}"
341 if options[:max_redir] > 0
# Recurse with a decremented redirection budget.
342 new_opts = options.dup
343 new_opts[:max_redir] -= 1
344 return get_response(new_loc, new_opts, &block)
346 raise "Too many redirections"
357 rescue StandardError, Timeout::Error => e
358 error "HttpUtil.get_response exception: #{e.inspect}, while trying to get #{uri}"
359 debug e.backtrace.join("\n")
# Every branch above returns or raises; reaching this point is a bug.
370 raise "This shouldn't happen"
# Store a successful response in the in-memory cache under key +k+ (the
# uri as a String), honouring "Pragma: no-cache".
373 def cache_response(k, resp)
375 if resp.key?('pragma') and resp['pragma'] == 'no-cache'
376 debug "Not caching #{k}, it has Pragma: no-cache"
379 # TODO should we skip caching if neither last-modified nor etag are present?
# Best-effort freshness metadata: prefer Last-Modified over Date.
384 u[:last_modified] = nil
385 u[:last_modified] = Time.httpdate(resp['date']) if resp.key?('date')
386 u[:last_modified] = Time.httpdate(resp['last-modified']) if resp.key?('last-modified')
388 u[:expires] = Time.httpdate(resp['expires']) if resp.key?('expires')
389 u[:revalidate] = false
390 if resp.key?('cache-control')
# Matches anywhere in the header value, so "no-cache=..." style variants
# also force revalidation.
392 case resp['cache-control']
393 when /no-cache|must-revalidate/
394 u[:revalidate] = true
398 u[:etag] = resp['etag'] if resp.key?('etag')
# Malformed date headers etc. must never break the request path; log and
# give up on caching this entry.
403 error "Failed to cache #{k}/#{resp.to_hash.inspect}: #{e.inspect}"
407 debug "Cached #{k}/#{resp.to_hash.inspect}: #{u.inspect_no_body}"
408 debug "#{@cache.size} pages (#{@cache.keys.join(', ')}) cached up to now"
411 # For debugging purposes
# Check whether the cached copy of +uri+ is stale and must be re-fetched.
# Returns true when there is no cached entry, when the entry carries no
# validator (no ETag and no Last-Modified), or when a conditional HEAD
# request does not come back 304 Not Modified.
420 def expired?(uri, readtimeout, opentimeout)
422 debug "Checking cache validity for #{k}"
424 return true unless @cache.key?(k)
427 # TODO we always revalidate for the time being
# Without any validator we cannot revalidate, so treat the entry as
# expired. NOTE(review): this assumes u[:etag] is always a String, but
# cache_response only sets it when the header is present — confirm it is
# initialized elsewhere, otherwise this can raise on nil.
429 if u[:etag].empty? and u[:last_modified].nil?
434 proxy = get_proxy(uri)
435 proxy.open_timeout = opentimeout
436 proxy.read_timeout = readtimeout
438 proxy.start() {|http|
439 yield uri.request_uri() if block_given?
# Conditional request headers built from the cached validators.
440 headers = @headers.dup
441 headers['If-None-Match'] = u[:etag] unless u[:etag].empty?
442 headers['If-Modified-Since'] = u[:last_modified].rfc2822 if u[:last_modified]
443 debug "Cache HEAD request headers: #{headers.inspect}"
444 # FIXME TODO We might want to use a Get here
445 # because if a 200 OK is returned we would get the new body
446 # with one connection less ...
447 req = Net::HTTP::Head.new(uri.request_uri(), headers)
448 if uri.user and uri.password
449 req.basic_auth(uri.user, uri.password)
451 resp = http.request(req)
452 debug "Checking cache validity of #{u.inspect_no_body} against #{resp.inspect}/#{resp.to_hash.inspect}"
# 304 Not Modified means the cached copy is still good.
454 when Net::HTTPNotModified
# On failure err on the safe side and consider the entry expired.
461 error "Failed to check cache validity for #{uri}: #{e.inspect}"
466 # gets a page from the cache if it's still (assumed to be) valid
467 # TODO remove stale cached pages, except when called with noexpire=true
# uri_or_str:: uri to fetch (URI object or String)
# noexpire:: when true, entries are protected from remove_stale_cache
# Returns the page body; the returned String is tagged with a @cached
# instance variable recording whether it was served from the cache.
468 def get_cached(uri_or_str, readtimeout=10, opentimeout=5,
469 max_redir=@bot.config['http.max_redir'],
470 noexpire=@bot.config['http.no_expire_cache'])
# Accept both URI objects and plain strings.
471 if uri_or_str.kind_of?(URI)
474 uri = URI.parse(uri_or_str.to_s)
476 debug "Getting cached #{uri}"
478 if expired?(uri, readtimeout, opentimeout)
479 debug "Cache expired"
# Re-fetch; wrapping noexpire in an Array is the private protocol that
# lets get() re-enter get_cached after a redirection (see get()).
480 bod = get(uri, readtimeout, opentimeout, max_redir, [noexpire])
481 bod.instance_variable_set(:@cached,false)
# Cache hit: bump usage stats and serve the stored body.
485 @cache[k][:count] += 1
486 @cache[k][:last_use] = Time.now
487 bod = @cache[k][:body]
488 bod.instance_variable_set(:@cached,true)
# Give the body a cached? predicate unless it already has one.
493 unless bod.respond_to?(:cached?)
# We consider a page to be manually expired if it has no
# etag and no last-modified and if any of the expiration
# conditions are met (expire_time, max_cache_time, Expires)
#
# hash:: cache entry; the :etag, :last_modified, :last_use, :first_use
#        and :expires keys are consulted
# time:: the current time, passed in so that one Time.now can serve a
#        whole cache sweep
def manually_expired?(hash, time)
  # FIX: the original used `and`/`or` here; those operators bind more
  # loosely than `=`, so `auto` only ever reflected the etag check and
  # `manual` only the expire_time check, silently discarding the rest of
  # each expression. Use &&/|| so the full conditions are evaluated.
  auto = hash[:etag].empty? && hash[:last_modified].nil?
  manual = (time - hash[:last_use] > @bot.config['http.expire_time']*60) ||
    (time - hash[:first_use] > @bot.config['http.max_cache_time']*60) ||
    (hash[:expires] < time)
  return (auto && manual)
end
# Drop every cached page that manually_expired? deems stale, to bound the
# memory used by the in-memory page cache.
513 def remove_stale_cache
514 debug "Removing stale cache"
515 debug "#{@cache.size} pages before"
518 @cache.reject! { |k, val|
519 manually_expired?(val, now)
# Cache maintenance must never take the bot down; log and carry on.
522 error "Failed to remove stale cache: #{e.inspect}"
524 debug "#{@cache.size} pages after"