require 'json'
require 'net/http'
require 'pp'
# To handle Gzipped pages
require 'stringio'
require 'zlib'

module ::Net
  # Extensions to Net::HTTPResponse for transparently decompressing response
  # bodies (Content-Encoding) and transcoding textual bodies to UTF-8.
  #
  # The methods below call +debug+, +warning+ and +error+ without a receiver;
  # these are presumably logging helpers provided elsewhere in the project.
  class HTTPResponse
    # Set to true by #partial_body when it streams the body itself. How the
    # flag is consumed is not visible from this file.
    attr_accessor :no_cache

    # Keep the stock #body reachable as #raw_body. The guard makes reloading
    # this file harmless (the alias is only created once).
    unless method_defined? :raw_body
      alias :raw_body :body
    end

    # Collect the candidate character sets for a textual body.
    #
    # Candidates are gathered, in increasing order of priority, from a
    # hard-coded ISO-8859-1 default, the Content-Type header, and either an
    # XML processing instruction or an HTML <meta http-equiv> tag in +str+.
    #
    # str:: the body to sniff (defaults to the raw body); it is NOT modified.
    #
    # Returns an Array of charset names without duplicates, or nil when the
    # Content-Type is neither text/* nor some flavour of XML.
    def body_charset(str=self.raw_body)
      ctype = self['content-type'] || 'text/html'
      return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i

      charsets = ['ISO-8859-1'] # should be in config

      if ctype =~ /charset=["']?([^\s"']+)["']?/i
        charsets << $1
        debug "charset #{charsets.last} added from header"
      end

      # str might be invalid utf-8 that would crash on the pattern match, so
      # sniff a scrubbed *copy*. Scrubbing in place would replace every
      # non-UTF-8 byte of the caller's data with U+FFFD before it has been
      # decoded, and would raise on frozen strings.
      probe = str.to_s.encode('UTF-8', 'UTF-8', invalid: :replace)
      case probe
      when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
        charsets << $1
        debug "xml charset #{charsets.last} added from xml pi"
      when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i
        meta = $1
        if meta =~ /charset=['"]?([^\s'";]+)['"]?/
          charsets << $1
          debug "html charset #{charsets.last} added from meta"
        end
      end

      charsets.uniq
    end

    # Transcode +str+ to UTF-8 using the candidates from #body_charset,
    # trying the most specific one (the last collected) first.
    #
    # str:: the (already decompressed) body; it is NOT modified.
    #
    # Returns a new UTF-8 String on success. Returns +str+ itself when it is
    # nil, when the body is not textual, or when every candidate failed.
    def body_to_utf(str)
      return str if str.nil?

      charsets = body_charset(str)
      return str unless charsets

      charsets.reverse_each do |charset|
        begin
          debug "try decoding using #{charset}"
          # Tag a copy rather than the caller's string: #partial_body keeps
          # appending binary chunks to the very buffer it passes in here.
          tagged = str.dup.force_encoding(charset)
          # The UTF-16 round trip forces a real conversion (dropping invalid
          # sequences) even when the source charset already is UTF-8.
          return tagged.encode('UTF-16le', invalid: :replace, replace: '').encode('UTF-8')
        rescue => e
          # Unknown charset name or unconvertible data: try the next one.
          error 'failed to use encoding'
          error e
        end
      end

      str
    end

    # Undo the Content-Encoding of +str+.
    #
    # * no header, an empty header or "identity": +str+ is returned as is
    # * anything containing "gzip" (gzip, x-gzip, and the non-rfc-compliant
    #   gzip;q=\d sent by some servers): gunzipped, recovering as much as
    #   possible from truncated streams
    # * "deflate": inflated, accepting both zlib-wrapped and raw streams
    # * a charset name: treated as a server bug, see below
    #
    # Raises Zlib::Error when a deflate stream is corrupt and RuntimeError
    # for any other content encoding.
    def decompress_body(str)
      encoding = self['content-encoding']
      case encoding
      when nil, '', /\Aidentity\z/i
        str
      when /gzip/i
        debug "gunzipping body"
        gunzip_body(str)
      when /\Adeflate\z/i
        debug "inflating body"
        inflate_body(str)
      when /\A(?:iso-8859-\d+|windows-\d+|utf-8|utf8)\z/i
        # B0rked servers (Freshmeat being one of them) sometimes return the charset
        # in the content-encoding; in this case we assume that the document has
        # a standard content-encoding
        old_hsh = to_hash
        # Fall back to the same default #body_charset uses so that a missing
        # Content-Type header does not blow up the recovery.
        self['content-type'] = "#{self['content-type'] || 'text/html'}; charset=#{encoding.downcase}"
        warning "Charset vs content-encoding confusion, trying to recover: from\n#{old_hsh.pretty_inspect}to\n#{self.to_hash.pretty_inspect}"
        str
      else
        debug to_hash
        raise "Unhandled content encoding #{encoding}"
      end
    end

    # The complete body, decompressed and transcoded to UTF-8.
    def cooked_body
      body_to_utf(decompress_body(raw_body))
    end

    # Read chunks from the body until we have at least _size_ bytes, yielding
    # the partial (decompressed, UTF-8) text at each chunk. Return the partial
    # body. With the default _size_ of 0 the whole body is read.
    #
    # If the body has already been read, it is used (and yielded) as a whole.
    def partial_body(size=0, &block)
      partial = String.new

      if @read
        debug "using body() as partial"
        partial = body
        yield body_to_utf(decompress_body(partial)) if block_given?
      else
        debug "disabling cache"
        self.no_cache = true
        read_body do |chunk|
          partial << chunk
          yield body_to_utf(decompress_body(partial)) if block_given?
          # Stop reading as soon as enough raw bytes have been collected.
          break if size && size > 0 && partial.bytesize >= size
        end
      end

      body_to_utf(decompress_body(partial))
    end

    # Parse the body as HTML with Nokogiri (which must have been loaded by
    # the caller) and return the result of evaluating the XPath +path+.
    def xpath(path)
      document = Nokogiri::HTML.parse(body)
      document.xpath(path)
    end

    # Parse the body as JSON and return the resulting Ruby object. Despite
    # the name this does not serialise the response.
    def to_json
      JSON.parse(body)
    end

    private

    # Gunzip +str+. If the stream cannot be unpacked completely (e.g. because
    # only part of the body has been read) return whatever could be recovered.
    def gunzip_body(str)
      Zlib::GzipReader.new(StringIO.new(str)).read
    rescue Zlib::Error => e
      debug "full gunzipping failed (#{e}), trying to recover as much as possible"
      recovered = String.new.force_encoding(Encoding::ASCII_8BIT)
      begin
        Zlib::GzipReader.new(StringIO.new(str)).each_byte { |byte| recovered << byte }
      rescue StandardError
        # Deliberate best effort: keep the bytes decoded before the failure.
      end
      recovered
    end

    # Inflate a "deflate" encoded body. The HTTP spec means the zlib format,
    # but many servers send a raw deflate stream instead, so try the former
    # and fall back to the latter when the zlib header/data check fails.
    def inflate_body(str)
      inflate_with_window(Zlib::MAX_WBITS, str)
    rescue Zlib::DataError
      # -MAX_WBITS stops zlib from looking for a zlib header
      inflate_with_window(-Zlib::MAX_WBITS, str)
    end

    # Inflate +str+ with the given zlib window bits. The stream is not
    # finished, so truncated input yields the data available so far.
    def inflate_with_window(window_bits, str)
      zstream = Zlib::Inflate.new(window_bits)
      begin
        zstream.inflate(str)
      ensure
        zstream.close
      end
    end
  end
end