require 'resolv'
require 'net/http'
+require 'cgi'
require 'iconv'
begin
require 'net/https'
rescue LoadError => e
- error "Couldn't load 'net/https': #{e.inspect}"
+ error "Couldn't load 'net/https': #{e.pretty_inspect}"
error "Secured HTTP connections will fail"
end
-module ::Net
- class HTTPResponse
- attr_accessor :no_cache
+# To handle Gzipped pages
+require 'stringio'
+require 'zlib'
+
+module ::Net
+ class HTTPResponse
+ attr_accessor :no_cache
if !instance_methods.include?('raw_body')
alias :raw_body :body
end
def body_to_utf(str)
charsets = self.body_charset(str) or return str
- charsets.reverse_each { |charset|
- begin
- return Iconv.iconv('utf-8//ignore', charset, str).first
- rescue
- debug "conversion failed for #{charset}"
+ charsets.reverse_each do |charset|
+ # XXX: this one is really ugly, but i don't know how to make it better
+ # -jsn
+
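+        # iconv can fail when a multibyte sequence is cut short at the end of
+        # the string (e.g. on a partial body), so retry with up to 5 trailing
+        # bytes dropped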
+ 0.upto(5) do |off|
+ begin
+ debug "trying #{charset} / offset #{off}"
+ return Iconv.iconv('utf-8//ignore',
+ charset,
+ str.slice(0 .. (-1 - off))).first
+ rescue
+ debug "conversion failed for #{charset} / offset #{off}"
+ end
end
- }
+ end
return str
end
- def body
- return self.body_to_utf(self.raw_body)
+ def decompress_body(str)
+ method = self['content-encoding']
+ case method
+ when nil
+ return str
+ when /gzip/ # Matches gzip, x-gzip, and the non-rfc-compliant gzip;q=\d sent by some servers
+ debug "gunzipping body"
+ begin
+ return Zlib::GzipReader.new(StringIO.new(str)).read
+ rescue Zlib::Error => e
+          # If we can't unpack the whole stream (e.g. because we're doing a
+          # partial read), try to salvage as much of it as we can
+ debug "full gunzipping failed (#{e}), trying to recover as much as possible"
+ ret = ""
+ begin
+ Zlib::GzipReader.new(StringIO.new(str)).each_byte { |byte|
+ ret << byte
+ }
+ rescue
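+            # the reader raises again once it hits the truncation point;
+            # keep whatever we managed to read so far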
+ end
+ return ret
+ end
+ else
+ raise "Unhandled content encoding #{method}"
+ end
+ end
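+
+    # Recovery sketch for the rescue branch above (assuming the response
+    # carries a gzip content-encoding; the data is illustrative):
+    #   io = StringIO.new
+    #   Zlib::GzipWriter.wrap(io) { |gz| gz.write('hello ' * 1000) }
+    #   resp.decompress_body(io.string[0, io.string.size / 2])
+    #   # => as much of the original text as could be gunzipped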
+
+ def cooked_body
+ return self.body_to_utf(self.decompress_body(self.raw_body))
end
- # Read chunks from the body until we have at least _size_ bytes, yielding
- # the partial text at each chunk. Return the partial body.
- def partial_body(size=0, &block)
+ # Read chunks from the body until we have at least _size_ bytes, yielding
+ # the partial text at each chunk. Return the partial body.
+ def partial_body(size=0, &block)
self.no_cache = true
- partial = String.new
+ partial = String.new
- self.read_body { |chunk|
- partial << chunk
- yield self.body_to_utf(partial) if block_given?
- break if size and size > 0 and partial.length >= size
- }
+ self.read_body { |chunk|
+ partial << chunk
+ yield self.body_to_utf(self.decompress_body(partial)) if block_given?
+ break if size and size > 0 and partial.length >= size
+ }
- return self.body_to_utf(partial)
- end
- end
+ return self.body_to_utf(self.decompress_body(partial))
+ end
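+
+    # Usage sketch (assuming +http+ is a started Net::HTTP and +req+ a GET
+    # request):
+    #   http.request(req) { |resp|
+    #     resp.partial_body(4096) { |text| debug "have #{text.length} chars" }
+    #   }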
+ end
end
Net::HTTP.version_1_2
self.revalidate
self.response.raw_body
rescue Exception => e
- error e.message
- error e.backtrace.join("\n")
+ error e
raise e
end
end
@cache = Hash.new
@headers = {
'Accept-Charset' => 'utf-8;q=1.0, *;q=0.8',
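+      # advertise gzip support, since response bodies can now be inflated by
+      # Net::HTTPResponse#decompress_body above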
+ 'Accept-Encoding' => 'gzip;q=1, identity;q=0.8, *;q=0.2',
'User-Agent' =>
"rbot http util #{$version} (http://linuxbrit.co.uk/rbot/)"
- }
+ }
debug "starting http cache cleanup timer"
@timer = @bot.timer.add(300) {
self.remove_stale_cache unless @bot.config['http.no_expire_cache']
}
- end
+ end
def cleanup
debug 'stopping http cache cleanup timer'
# proxying based on the bot's proxy configuration.
# This will include per-url proxy configuration based on the bot config
# +http_proxy_include/exclude+ options.
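+  # Example (arguments illustrative; see the request code below for the real
+  # call site):
+  #   get_proxy(uri, :read_timeout => 30).start { |http|
+  #     http.request(Net::HTTP::Get.new(uri.request_uri))
+  #   }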
-
+
def get_proxy(uri, options = {})
opts = {
:read_timeout => 10,
warning ":| redirect w/o location?"
end
end
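+      # patch this response object so that #body returns the decompressed,
+      # UTF-8 text instead of the raw bytes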
+ class << resp
+ undef_method :body
+ alias :body :cooked_body
+ end
if block_given?
yield(resp)
else
#
# Generic http transaction method
#
- # It will return a HTTP::Response object or raise an exception
+ # It will return a Net::HTTPResponse object or raise an exception
#
# If a block is given, it will yield the response (see :yield option)
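+  # Usage sketch (assuming this documents the get_response entry point; the
+  # URI is illustrative):
+  #   resp = get_response(URI.parse('http://example.com/')) { |r| debug r.class }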
return handle_response(uri, cached.response, opts, &block)
end
end
-
+
headers = @headers.dup.merge(opts[:headers] || {})
headers['Range'] = opts[:range] if opts[:range]
get_proxy(uri, opts).start do |http|
http.request(req) do |resp|
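+          # remember which URI this response was actually fetched from, so
+          # callers can tell where a redirected request ended up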
+ resp['x-rbot-location'] = uri.to_s
if Net::HTTPNotModified === resp
debug "not modified"
begin
cached.revalidate(resp)
rescue Exception => e
- error e.message
- error e.backtrace.join("\n")
+ error e
end
debug "reusing cached"
resp = cached.response
Net::HTTPPartialContent === resp
return resp.body
rescue Exception => e
- error e.message
- error e.backtrace.join("\n")
+ error e
end
return nil
end
Net::HTTPServerError === resp
return resp
rescue Exception => e
- error e.message
- error e.backtrace.join("\n")
+ error e
end
return nil
end
raise 'http error' unless Net::HTTPOK === resp
return resp
rescue Exception => e
- error e.message
- error e.backtrace.join("\n")
+ error e
end
return nil
end
(now - val.last_used > max_last) || (now - val.first_used > max_first)
}
rescue => e
- error "Failed to remove stale cache: #{e.inspect}"
+ error "Failed to remove stale cache: #{e.pretty_inspect}"
end
debug "#{@cache.size} pages after"
end
debug 'shutting down httputil'
@bot.httputil.cleanup
@bot.httputil = nil
+ super
end
end