4 # :title: rbot HTTP provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2005 Tom Gilbert
10 # Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta
11 # Copyright:: (C) 2006,2007 Giuseppe Bilotta
18 error "Couldn't load 'net/https': #{e.inspect}"
19 error "Secured HTTP connections will fail"
24 # Read chunks from the body until we have at least _size_ bytes, yielding
25 # the partial text at each chunk. Return the partial body.
# size:: minimum number of bytes to accumulate before stopping; 0 (the
#        default) or nil means "read the whole body"
# block:: optional; called with the partial body accumulated so far after
#         every chunk
26 def partial_body(size=0, &block)
# NOTE(review): the `partial` accumulator is initialized on lines not shown
# in this excerpt; the statements below rely on it — confirm against the
# full file.
30 self.read_body { |chunk|
32 yield partial if block_given?
# Stop reading early once enough data has been gathered; a nil or zero
# size disables the early exit so the whole body is read.
33 break if size and size > 0 and partial.length >= size
46 # class for making http requests easier (mainly for plugins to use)
47 # this class can check the bot proxy configuration to determine if a proxy
48 # needs to be used, which includes support for per-url proxy configuration.
# HTTP/proxy tunables registered with the bot configuration system.
# NOTE(review): several `:default =>` lines fall outside this excerpt.
50 BotConfig.register BotConfigBooleanValue.new('http.use_proxy',
51 :default => false, :desc => "should a proxy be used for HTTP requests?")
# NOTE(review): a false default on a String value is unusual; get_proxy only
# tests it for truthiness, so it behaves like "unset" — confirm intent.
52 BotConfig.register BotConfigStringValue.new('http.proxy_uri', :default => false,
53 :desc => "Proxy server to use for HTTP requests (URI, e.g http://proxy.host:port)")
54 BotConfig.register BotConfigStringValue.new('http.proxy_user',
56 :desc => "User for authenticating with the http proxy (if required)")
57 BotConfig.register BotConfigStringValue.new('http.proxy_pass',
59 :desc => "Password for authenticating with the http proxy (if required)")
# Regexp source strings matched by proxy_required against the host name and
# every address it resolves to.
60 BotConfig.register BotConfigArrayValue.new('http.proxy_include',
62 :desc => "List of regexps to check against a URI's hostname/ip to see if we should use the proxy to access this URI. All URIs are proxied by default if the proxy is set, so this is only required to re-include URIs that might have been excluded by the exclude list. e.g. exclude /.*\.foo\.com/, include bar\.foo\.com")
63 BotConfig.register BotConfigArrayValue.new('http.proxy_exclude',
# NOTE(review): "use avoid" in the description below is a typo, but it is a
# runtime (user-visible) string, so it is left untouched in this doc pass.
65 :desc => "List of regexps to check against a URI's hostname/ip to see if we should use avoid the proxy to access this URI and access it directly")
66 BotConfig.register BotConfigIntegerValue.new('http.max_redir',
68 :desc => "Maximum number of redirections to be used when getting a document")
# Cache-expiry knobs, both expressed in minutes (consulted by expired? and
# manually_expired?).
69 BotConfig.register BotConfigIntegerValue.new('http.expire_time',
71 :desc => "After how many minutes since last use a cached document is considered to be expired")
72 BotConfig.register BotConfigIntegerValue.new('http.max_cache_time',
74 :desc => "After how many minutes since first use a cached document is considered to be expired")
# Whether cached pages should ever be expired; this is read as a boolean
# (it supplies the default for get_cached's +noexpire+ parameter).
# FIX: register it as a Boolean value — it was a BotConfigIntegerValue even
# though the description ("Set this to true ...") and every use treat it as
# true/false.
BotConfig.register BotConfigBooleanValue.new('http.no_expire_cache',
  :default => false,
  :desc => "Set this to true if you want the bot to never expire the cached pages")
# Byte budget for "peek" downloads used when only page metadata is needed;
# 0 disables the limit and fetches whole pages.
78 BotConfig.register BotConfigIntegerValue.new('http.info_bytes',
80 :desc => "How many bytes to download from a web page to find some information. Set to 0 to let the bot download the whole page.")
86 'User-Agent' => "rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)",
90 attr_reader :last_response
# Decide whether requests for +uri+ should go through the configured proxy.
#
# All URIs are proxied by default; the http.proxy_exclude and
# http.proxy_include bot config options hold lists of regexp source
# strings that are matched against the URI's host name and every IP
# address it resolves to. Excludes are applied first, then includes, so an
# include can re-enable the proxy for a host matched by a broader exclude.
#
# uri:: the URI being requested (anything responding to #host)
#
# Returns true when the proxy should be used, false otherwise.
def proxy_required(uri)
  use_proxy = true
  # Fast path: with no include/exclude lists there is nothing to match.
  if @bot.config["http.proxy_exclude"].empty? && @bot.config["http.proxy_include"].empty?
    return use_proxy
  end

  # Match against the host name itself and, when resolvable, all of its
  # addresses. Resolution failures are logged but non-fatal.
  list = [uri.host]
  begin
    list.concat Resolv.getaddresses(uri.host)
  rescue StandardError => err
    # FIX: the original message was "couldn't resolve host uri.host" — the
    # host name was never interpolated.
    warning "couldn't resolve host #{uri.host}: #{err.inspect}"
  end

  unless @bot.config["http.proxy_exclude"].empty?
    re = @bot.config["http.proxy_exclude"].collect { |r| Regexp.new(r) }
    re.each { |r|
      list.each { |item|
        if r.match(item)
          use_proxy = false
          break
        end
      }
    }
  end
  unless @bot.config["http.proxy_include"].empty?
    re = @bot.config["http.proxy_include"].collect { |r| Regexp.new(r) }
    re.each { |r|
      list.each { |item|
        if r.match(item)
          use_proxy = true
          break
        end
      }
    }
  end
  debug "using proxy for uri #{uri}?: #{use_proxy}"
  return use_proxy
end
# uri:: Uri to create a proxy for
#
# Returns a Net::HTTP object configured according to the bot's proxy
# settings (http.use_proxy, http.proxy_uri and friends), including the
# per-URI http.proxy_include/http.proxy_exclude filtering performed by
# proxy_required. The http_proxy environment variable is honoured too,
# but an explicit http.proxy_uri setting takes precedence over it.
def get_proxy(uri)
  proxy = nil
  proxy_host = nil
  proxy_port = nil
  proxy_user = nil
  proxy_pass = nil

  if @bot.config["http.use_proxy"]
    if (ENV['http_proxy'])
      proxy = URI.parse ENV['http_proxy'] rescue nil
    end
    if (@bot.config["http.proxy_uri"])
      proxy = URI.parse @bot.config["http.proxy_uri"] rescue nil
    end
    if proxy
      debug "proxy is set to #{proxy.host} port #{proxy.port}"
      if proxy_required(uri)
        proxy_host = proxy.host
        proxy_port = proxy.port
        proxy_user = @bot.config["http.proxy_user"]
        proxy_pass = @bot.config["http.proxy_pass"]
      end
    end
  end

  # FIX: the last argument to Net::HTTP.new is the proxy *password*; it was
  # mistakenly passed as proxy_port, so authenticated proxies never worked.
  h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_pass)
  h.use_ssl = true if uri.scheme == "https"
  return h
end
172 # uri:: uri to query (Uri object)
173 # readtimeout:: timeout for reading the response
174 # opentimeout:: timeout for opening the connection
# max_redir:: maximum number of redirections that will be followed
# cache:: when truthy, successful responses are stored via cache_response;
#         an Array value is a private protocol used by get_cached (see the
#         inline comment further down)
176 # simple get request, returns (if possible) response body following redirs
177 # and caching if requested
178 # if a block is given, it yields the urls it gets redirected to
179 # TODO we really need something to implement proper caching
180 def get(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"], cache=false)
# Accept both URI objects and plain strings.
181 if uri_or_str.kind_of?(URI)
184 uri = URI.parse(uri_or_str.to_s)
186 debug "Getting #{uri}"
# get_proxy returns a (possibly proxied) Net::HTTP for this uri.
188 proxy = get_proxy(uri)
189 proxy.open_timeout = opentimeout
190 proxy.read_timeout = readtimeout
193 proxy.start() {|http|
194 yield uri.request_uri() if block_given?
195 req = Net::HTTP::Get.new(uri.request_uri(), @headers)
# Forward credentials embedded in the URI, if any.
196 if uri.user and uri.password
197 req.basic_auth(uri.user, uri.password)
199 resp = http.request(req)
201 when Net::HTTPSuccess
203 debug "Caching #{uri.to_s}"
204 cache_response(uri.to_s, resp)
207 when Net::HTTPRedirection
208 if resp.key?('location')
# Resolve the Location header against the current uri so that
# relative redirects work too.
209 new_loc = URI.join(uri, resp['location'])
210 debug "Redirecting #{uri} to #{new_loc}"
211 yield new_loc if block_given?
213 # If cache is an Array, we assume get was called by get_cached
214 # because of a cache miss and that the first value of the Array
215 # was the noexpire value. Since the cache miss might have been
216 # caused by a redirection, we want to try get_cached again
217 # TODO FIXME look at Python's httplib2 for a most likely
218 # better way to handle all this mess
219 if cache.kind_of?(Array)
220 return get_cached(new_loc, readtimeout, opentimeout, max_redir-1, cache[0])
222 return get(new_loc, readtimeout, opentimeout, max_redir-1, cache)
225 warning "Max redirection reached, not going to #{new_loc}"
228 warning "Unknown HTTP redirection #{resp.inspect}"
231 debug "HttpUtil.get return code #{resp.code} #{resp.body}"
# Remember the last response for callers that want status/headers.
233 @last_response = resp
# Network and timeout failures are logged; the method then returns nil.
236 rescue StandardError, Timeout::Error => e
237 error "HttpUtil.get exception: #{e.inspect}, while trying to get #{uri}"
238 debug e.backtrace.join("\n")
# Just like get(), but issues a HEAD request and returns the whole
# Net::HTTPResponse (there is no body), following up to +max_redir+
# redirections. Yields the request uri (and each redirect Location) to an
# optional block.
#
# uri_or_str:: uri to query (URI object or String)
# readtimeout:: timeout for reading the response
# opentimeout:: timeout for opening the connection
# max_redir:: maximum number of redirections to follow
#
# Returns the response on success, nil on failure; the last response seen
# is stored in @last_response.
def head(uri_or_str, readtimeout=10, opentimeout=5, max_redir=@bot.config["http.max_redir"])
  if uri_or_str.kind_of?(URI)
    uri = uri_or_str
  else
    uri = URI.parse(uri_or_str.to_s)
  end

  proxy = get_proxy(uri)
  proxy.open_timeout = opentimeout
  proxy.read_timeout = readtimeout

  begin
    proxy.start() {|http|
      yield uri.request_uri() if block_given?
      req = Net::HTTP::Head.new(uri.request_uri(), @headers)
      # Forward credentials embedded in the URI, if any.
      if uri.user and uri.password
        req.basic_auth(uri.user, uri.password)
      end
      resp = http.request(req)
      case resp
      when Net::HTTPSuccess
        return resp
      when Net::HTTPRedirection
        debug "Redirecting #{uri} to #{resp['location']}"
        yield resp['location'] if block_given?
        if max_redir > 0
          # FIX: resolve the Location header against the current uri, as
          # get() does; bare URI.parse breaks on relative redirects.
          return head(URI.join(uri, resp['location']), readtimeout, opentimeout, max_redir-1)
        else
          warning "Max redirection reached, not going to #{resp['location']}"
        end
      else
        debug "HttpUtil.head return code #{resp.code}"
      end
      @last_response = resp
      return nil
    }
  rescue StandardError, Timeout::Error => e
    error "HttpUtil.head exception: #{e.inspect}, while trying to get #{uri}"
    debug e.backtrace.join("\n")
  end
  return nil
end
289 # uri:: uri to query (Uri object or String)
290 # opts:: options. Currently used:
291 # :open_timeout:: open timeout for the proxy
292 # :read_timeout:: read timeout for the proxy
293 # :cache:: should we cache results?
# :max_redir:: maximum number of redirections to follow (defaults to the
#              http.max_redir bot config value)
295 # This method is used to get responses following redirections.
297 # It will return either a Net::HTTPResponse or an error.
299 # If a block is given, it will yield the response or error instead of
302 def get_response(uri_or_str, opts={}, &block)
# Accept both URI objects and plain strings.
303 if uri_or_str.kind_of?(URI)
306 uri = URI.parse(uri_or_str.to_s)
308 debug "Getting #{uri}"
# Defaults merged with the caller-supplied opts.
313 :max_redir => @bot.config["http.max_redir"],
318 cache = options[:cache]
320 proxy = get_proxy(uri)
321 proxy.open_timeout = options[:open_timeout]
322 proxy.read_timeout = options[:read_timeout]
325 proxy.start() {|http|
326 req = Net::HTTP::Get.new(uri.request_uri(), @headers)
# Forward credentials embedded in the URI, if any.
327 if uri.user and uri.password
328 req.basic_auth(uri.user, uri.password)
# Streaming form of request: resp is yielded before the body is read, so
# callers can use partial_body on it.
330 http.request(req) { |resp|
332 when Net::HTTPSuccess
334 debug "Caching #{uri.to_s}"
335 cache_response(uri.to_s, resp)
337 when Net::HTTPRedirection
338 if resp.key?('location')
# Resolve relative Location headers; fall back to parsing the raw value
# when joining fails.
339 new_loc = URI.join(uri, resp['location']) rescue URI.parse(resp['location'])
340 debug "Redirecting #{uri} to #{new_loc}"
341 if options[:max_redir] > 0
# Recurse with a decremented redirection budget.
342 new_opts = options.dup
343 new_opts[:max_redir] -= 1
344 return get_response(new_loc, new_opts, &block)
346 raise "Too many redirections"
357 rescue StandardError, Timeout::Error => e
358 error "HttpUtil.get_response exception: #{e.inspect}, while trying to get #{uri}"
359 debug e.backtrace.join("\n")
# Every branch above returns or raises; reaching this point is a bug.
370 raise "This shouldn't happen"
# Store a successful response in the in-memory cache under key +k+ (the
# uri as a String), honouring "Pragma: no-cache".
373 def cache_response(k, resp)
375 if resp.key?('pragma') and resp['pragma'] == 'no-cache'
376 debug "Not caching #{k}, it has Pragma: no-cache"
379 # TODO should we skip caching if neither last-modified nor etag are present?
# Best-effort freshness metadata: prefer Last-Modified over Date.
384 u[:last_modified] = nil
385 u[:last_modified] = Time.httpdate(resp['date']) if resp.key?('date')
386 u[:last_modified] = Time.httpdate(resp['last-modified']) if resp.key?('last-modified')
388 u[:expires] = Time.httpdate(resp['expires']) if resp.key?('expires')
389 u[:revalidate] = false
390 if resp.key?('cache-control')
# Matches anywhere in the header value, so "no-cache=..." style variants
# also force revalidation.
392 case resp['cache-control']
393 when /no-cache|must-revalidate/
394 u[:revalidate] = true
398 u[:etag] = resp['etag'] if resp.key?('etag')
# Malformed date headers etc. must never break the request path; log and
# give up on caching this entry.
403 error "Failed to cache #{k}/#{resp.to_hash.inspect}: #{e.inspect}"
407 debug "Cached #{k}/#{resp.to_hash.inspect}: #{u.inspect_no_body}"
408 debug "#{@cache.size} pages (#{@cache.keys.join(', ')}) cached up to now"
411 # For debugging purposes
# Check whether the cached copy of +uri+ is stale and must be re-fetched.
# Returns true when there is no cached entry, when the entry carries no
# validator (no ETag and no Last-Modified), or when a conditional HEAD
# request does not come back 304 Not Modified.
420 def expired?(uri, readtimeout, opentimeout)
422 debug "Checking cache validity for #{k}"
424 return true unless @cache.key?(k)
427 # TODO we always revalidate for the time being
# Without any validator we cannot revalidate, so treat the entry as
# expired. NOTE(review): this assumes u[:etag] is always a String, but
# cache_response only sets it when the header is present — confirm it is
# initialized elsewhere, otherwise this can raise on nil.
429 if u[:etag].empty? and u[:last_modified].nil?
434 proxy = get_proxy(uri)
435 proxy.open_timeout = opentimeout
436 proxy.read_timeout = readtimeout
438 proxy.start() {|http|
439 yield uri.request_uri() if block_given?
# Conditional request headers built from the cached validators.
440 headers = @headers.dup
441 headers['If-None-Match'] = u[:etag] unless u[:etag].empty?
442 headers['If-Modified-Since'] = u[:last_modified].rfc2822 if u[:last_modified]
443 debug "Cache HEAD request headers: #{headers.inspect}"
444 # FIXME TODO We might want to use a Get here
445 # because if a 200 OK is returned we would get the new body
446 # with one connection less ...
447 req = Net::HTTP::Head.new(uri.request_uri(), headers)
448 if uri.user and uri.password
449 req.basic_auth(uri.user, uri.password)
451 resp = http.request(req)
452 debug "Checking cache validity of #{u.inspect_no_body} against #{resp.inspect}/#{resp.to_hash.inspect}"
# 304 Not Modified means the cached copy is still good.
454 when Net::HTTPNotModified
# On failure err on the safe side and consider the entry expired.
461 error "Failed to check cache validity for #{uri}: #{e.inspect}"
466 # gets a page from the cache if it's still (assumed to be) valid
467 # TODO remove stale cached pages, except when called with noexpire=true
# uri_or_str:: uri to fetch (URI object or String)
# noexpire:: when true, entries are protected from remove_stale_cache
# Returns the page body; the returned String is tagged with a @cached
# instance variable recording whether it was served from the cache.
468 def get_cached(uri_or_str, readtimeout=10, opentimeout=5,
469 max_redir=@bot.config['http.max_redir'],
470 noexpire=@bot.config['http.no_expire_cache'])
# Accept both URI objects and plain strings.
471 if uri_or_str.kind_of?(URI)
474 uri = URI.parse(uri_or_str.to_s)
476 debug "Getting cached #{uri}"
478 if expired?(uri, readtimeout, opentimeout)
479 debug "Cache expired"
# Re-fetch; wrapping noexpire in an Array is the private protocol that
# lets get() re-enter get_cached after a redirection (see get()).
480 bod = get(uri, readtimeout, opentimeout, max_redir, [noexpire])
481 bod.instance_variable_set(:@cached,false)
# Cache hit: bump usage stats and serve the stored body.
485 @cache[k][:count] += 1
486 @cache[k][:last_use] = Time.now
487 bod = @cache[k][:body]
488 bod.instance_variable_set(:@cached,true)
# Give the body a cached? predicate unless it already has one.
493 unless bod.respond_to?(:cached?)
# We consider a page to be manually expired if it has no
# etag and no last-modified and if any of the expiration
# conditions are met (expire_time, max_cache_time, Expires)
#
# hash:: cache entry; the :etag, :last_modified, :last_use, :first_use
#        and :expires keys are consulted
# time:: the current time, passed in so that one Time.now can serve a
#        whole cache sweep
def manually_expired?(hash, time)
  # FIX: the original used `and`/`or` here; those operators bind more
  # loosely than `=`, so `auto` only ever reflected the etag check and
  # `manual` only the expire_time check, silently discarding the rest of
  # each expression. Use &&/|| so the full conditions are evaluated.
  auto = hash[:etag].empty? && hash[:last_modified].nil?
  manual = (time - hash[:last_use] > @bot.config['http.expire_time']*60) ||
    (time - hash[:first_use] > @bot.config['http.max_cache_time']*60) ||
    (hash[:expires] < time)
  return (auto && manual)
end
# Drop every cached page that manually_expired? deems stale, to bound the
# memory used by the in-memory page cache.
513 def remove_stale_cache
514 debug "Removing stale cache"
515 debug "#{@cache.size} pages before"
518 @cache.reject! { |k, val|
519 manually_expired?(val, now)
# Cache maintenance must never take the bot down; log and carry on.
522 error "Failed to remove stale cache: #{e.inspect}"
524 debug "#{@cache.size} pages after"