X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Fhttputil.rb;h=fb275547169efefce78632b9eb62d3efc0daf39c;hb=391c2ce19c0954d3ea0f7b6cc17c001d26491ca6;hp=8272057a09f5638ae32e9c4bdb8b6f0fb8a829ee;hpb=68bcb2b77929549d171af3fd86774d325644d45c;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index 8272057a..fb275547 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
 #-- vim:sw=2:et
 #++
 #
@@ -10,12 +11,11 @@
 require 'resolv'
 require 'net/http'
 require 'cgi'
-require 'iconv'
+
 begin
-  require 'net/https'
+  require 'nokogiri'
 rescue LoadError => e
-  error "Couldn't load 'net/https': #{e.pretty_inspect}"
-  error "Secured HTTP connections will fail"
+  error "No nokogiri library found, some features might not be available!"
 end
 
 # To handle Gzipped pages
@@ -25,7 +25,7 @@ require 'zlib'
 module ::Net
   class HTTPResponse
     attr_accessor :no_cache
-    if !instance_methods.include?('raw_body')
+    unless method_defined? :raw_body
       alias :raw_body :body
     end
 
@@ -33,13 +33,15 @@ module ::Net
       ctype = self['content-type'] || 'text/html'
       return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i
 
-      charsets = ['latin1'] # should be in config
+      charsets = ['ISO-8859-1'] # should be in config
 
       if ctype.match(/charset=["']?([^\s"']+)["']?/i)
         charsets << $1
         debug "charset #{charsets.last} added from header"
       end
 
+      # str might contain invalid UTF-8 that would crash the pattern match below:
+      str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
       case str
       when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
         charsets << $1
@@ -58,20 +60,20 @@ module ::Net
       charsets = self.body_charset(str) or return str
 
       charsets.reverse_each do |charset|
-        # XXX: this one is really ugly, but i don't know how to make it better
-        #  -jsn
-
-        0.upto(5) do |off|
-          begin
-            debug "trying #{charset} / offset #{off}"
-            return Iconv.iconv('utf-8//ignore',
-                               charset,
-                               str.slice(0 .. (-1 - off))).first
-          rescue
-            debug "conversion failed for #{charset} / offset #{off}"
+        begin
+          debug "try decoding using #{charset}"
+          str.force_encoding(charset)
+          tmp = str.encode('UTF-16le', :invalid => :replace, :replace => '').encode('UTF-8')
+          if tmp
+            str = tmp
+            break
           end
+        rescue
+          error 'failed to use encoding'
+          error $!
         end
       end
+
       return str
     end
 
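The hunk above replaces the old Iconv offset-guessing loop with a single round
trip through UTF-16LE, which drops invalid byte sequences instead of raising.
A minimal sketch of that technique, using a hypothetical mislabeled Latin-1
body (assumes Ruby 1.9+ String#encode semantics):

    # Relabel the raw bytes, then round-trip through UTF-16LE; :invalid =>
    # :replace with an empty replacement silently discards broken sequences.
    str = "caf\xE9".dup                # raw Latin-1 byte in a UTF-8 literal
    str.force_encoding('ISO-8859-1')   # what the Content-Type header claimed
    utf = str.encode('UTF-16le', :invalid => :replace, :replace => '').encode('UTF-8')
    utf  # => "café"
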
@@ -88,7 +90,8 @@ module ::Net
         # If we can't unpack the whole stream (e.g. because we're doing a
         # partial read
         debug "full gunzipping failed (#{e}), trying to recover as much as possible"
-        ret = ""
+        ret = ''
+        ret.force_encoding(Encoding::ASCII_8BIT)
         begin
           Zlib::GzipReader.new(StringIO.new(str)).each_byte { |byte|
             ret << byte
@@ -99,7 +102,7 @@ module ::Net
         end
       when 'deflate'
         debug "inflating body"
-        # From http://www.koders.com/ruby/fid927B4382397E5115AC0ABE21181AB5C1CBDD5C17.aspx?s=thread: 
+        # From http://www.koders.com/ruby/fid927B4382397E5115AC0ABE21181AB5C1CBDD5C17.aspx?s=thread:
         # -MAX_WBITS stops zlib from looking for a zlib header
         inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS)
         begin
@@ -112,7 +115,7 @@ module ::Net
       when /^(?:iso-8859-\d+|windows-\d+|utf-8|utf8)$/i
         # B0rked servers (Freshmeat being one of them) sometimes return the charset
         # in the content-encoding; in this case we assume that the document has
-        # a standarc content-encoding
+        # a standard content-encoding
         old_hsh = self.to_hash
         self['content-type']= self['content-type']+"; charset="+method.downcase
         warning "Charset vs content-encoding confusion, trying to recover: from\n#{old_hsh.pretty_inspect}to\n#{self.to_hash.pretty_inspect}"
@@ -149,11 +152,18 @@ module ::Net
       return self.body_to_utf(self.decompress_body(partial))
     end
+
+    def xpath(path)
+      document = Nokogiri::HTML.parse(self.body)
+      document.xpath(path)
+    end
+
+    def to_json
+      JSON::parse(self.body)
+    end
   end
 end
 
-Net::HTTP.version_1_2
-
 module ::Irc
 module Utils
 
@@ -190,7 +200,7 @@ class HttpUtil
   Bot::Config.register Bot::Config::IntegerValue.new('http.max_cache_time',
     :default => 60*24,
     :desc => "After how many minutes since first use a cached document is considered to be expired")
-  Bot::Config.register Bot::Config::IntegerValue.new('http.no_expire_cache',
+  Bot::Config.register Bot::Config::BooleanValue.new('http.no_expire_cache',
     :default => false,
     :desc => "Set this to true if you want the bot to never expire the cached pages")
   Bot::Config.register Bot::Config::IntegerValue.new('http.info_bytes',
@@ -389,7 +399,7 @@ class HttpUtil
       end
     end
 
-    h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_port)
+    h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_pass)
 
     h.use_ssl = true if uri.scheme == "https"
     h.read_timeout = opts[:read_timeout]
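For reference, the one-character fix above (proxy_port was passed twice)
matters because of Net::HTTP.new's positional proxy arguments: the sixth
argument is the proxy password. Host names and credentials below are
hypothetical:

    require 'net/http'
    # Net::HTTP.new(address, port, p_addr, p_port, p_user, p_pass)
    h = Net::HTTP.new('example.com', 80,         # target host and port
                      'proxy.example.com', 3128, # proxy host and port
                      'proxyuser', 'proxypass')  # credentials, not the port again
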
@@ -409,8 +419,28 @@ class HttpUtil
     if resp.key?('location')
       raise 'Too many redirections' if opts[:max_redir] <= 0
       yield resp if opts[:yield] == :all && block_given?
+      # some servers actually provide an unescaped location, e.g.
+      # http://ulysses.soup.io/post/60734021/Image%20curve%20ball
+      # redirects to something like
+      # http://ulysses.soup.io/post/60734021/Image curve ball?sessid=8457b2a3752085cca3fb1d79b9965446
+      # causing the URI parser to (obviously) complain. We cannot just
+      # escape blindly, as this would make a mess of already-escaped
+      # locations, so we only do it if URI.parse fails
       loc = resp['location']
-      new_loc = URI.join(uri.to_s, loc) rescue URI.parse(loc)
+      escaped = false
+      debug "redirect location: #{loc.inspect}"
+      begin
+        new_loc = URI.join(uri.to_s, loc) rescue URI.parse(loc)
+      rescue
+        if escaped
+          raise $!
+        else
+          loc = URI.escape(loc)
+          escaped = true
+          debug "escaped redirect location: #{loc.inspect}"
+          retry
+        end
+      end
       new_opts = opts.dup
       new_opts[:max_redir] -= 1
       case opts[:method].to_s.downcase.intern
@@ -418,9 +448,23 @@ class HttpUtil
         new_opts[:method] = :get
       end
       if resp['set-cookie']
-        debug "setting cookie #{resp['set-cookie']}"
-        new_opts[:headers] ||= Hash.new
-        new_opts[:headers]['Cookie'] = resp['set-cookie']
+        debug "set cookie request for #{resp['set-cookie']}"
+        cookie, cookie_flags = (resp['set-cookie']+'; ').split('; ', 2)
+        domain = uri.host
+        cookie_flags.scan(/(\S+)=(\S+);/) { |key, val|
+          if key.intern == :domain
+            domain = val
+            break
+          end
+        }
+        debug "cookie domain #{domain} / #{new_loc.host}"
+        if new_loc.host.rindex(domain) == new_loc.host.length - domain.length
+          debug "setting cookie"
+          new_opts[:headers] ||= Hash.new
+          new_opts[:headers]['Cookie'] = cookie
+        else
+          debug "cookie is for another domain, ignoring"
+        end
       end
       debug "following the redirect to #{new_loc}"
       return get_response(new_loc, new_opts, &block)
@@ -482,6 +526,14 @@ class HttpUtil
   # def get_response(uri_or_s, options = {}, &block) # :yields: resp
   def get_response(uri_or_s, options = {}, &block) # :yields: resp
     uri = uri_or_s.kind_of?(URI) ? uri_or_s : URI.parse(uri_or_s.to_s)
+    unless URI::HTTP === uri
+      if uri.scheme
+        raise "#{uri.scheme.inspect} URI scheme is not supported"
+      else
+        raise "don't know what to do with #{uri.to_s.inspect}"
+      end
+    end
+
     opts = {
       :max_redir => @bot.config['http.max_redir'],
       :yield => :final,
@@ -489,9 +541,6 @@ class HttpUtil
       :method => :GET
     }.merge(options)
 
-    resp = nil
-    cached = nil
-
     req_class = case opts[:method].to_s.downcase.intern
     when :head, :"net::http::head"
       opts[:max_redir] = -1
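The redirect handling added above escapes the Location value only after URI
parsing has failed once, then retries. The same flow as a standalone sketch;
the URL is hypothetical, and URI.escape is assumed available (true on the
1.8/1.9 rubies this code targets):

    require 'uri'
    loc = 'http://example.com/Image curve ball'  # unescaped Location header
    escaped = false
    begin
      new_loc = URI.parse(loc)
    rescue URI::InvalidURIError
      raise if escaped       # escaping was already tried once: give up
      loc = URI.escape(loc)
      escaped = true
      retry                  # re-parse the escaped value
    end
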
debug "using cached" @@ -537,7 +588,10 @@ class HttpUtil headers['Range'] = opts[:range] if opts[:range] headers['Authorization'] = opts[:auth_head] if opts[:auth_head] - cached.setup_headers(headers) if cached && (req_class == Net::HTTP::Get) + if opts[:cache] && cached && (req_class == Net::HTTP::Get) + cached.setup_headers headers + end + req = req_class.new(uri.request_uri, headers) if uri.user && uri.password req.basic_auth(uri.user, uri.password) @@ -547,22 +601,23 @@ class HttpUtil debug "prepared request: #{req.to_hash.inspect}" begin - get_proxy(uri, opts).start do |http| - http.request(req) do |resp| - resp['x-rbot-location'] = uri.to_s - if Net::HTTPNotModified === resp - debug "not modified" - begin - cached.revalidate(resp) - rescue Exception => e - error e + get_proxy(uri, opts).start do |http| + http.request(req) do |resp| + resp['x-rbot-location'] = uri.to_s + if Net::HTTPNotModified === resp + debug "not modified" + begin + cached.revalidate(resp) + rescue Exception => e + error e + end + debug "reusing cached" + resp = cached.response + elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp + debug "http error, deleting cached obj" if cached + @cache.delete(cache_key) end - debug "reusing cached" - resp = cached.response - elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp - debug "http error, deleting cached obj" if cached - @cache.delete(cache_key) - elsif opts[:cache] + begin return handle_response(uri, resp, opts, &block) ensure @@ -571,11 +626,8 @@ class HttpUtil @cache[cache_key] = cached end end - return ret end - return handle_response(uri, resp, opts, &block) end - end rescue Exception => e error e raise e.message @@ -593,7 +645,11 @@ class HttpUtil resp = get_response(uri, options, &block) raise "http error: #{resp}" unless Net::HTTPOK === resp || Net::HTTPPartialContent === resp - return resp.body + if options[:resp] + return resp + else + return resp.body + end rescue Exception => e error e end @@ -630,7 +686,7 @@ class HttpUtil opts = {:method => :post, :body => data, :cache => false}.merge(options) begin resp = get_response(uri, opts, &block) - raise 'http error' unless Net::HTTPOK === resp + raise 'http error' unless Net::HTTPOK === resp or Net::HTTPCreated === resp return resp rescue Exception => e error e