+# encoding: UTF-8
#-- vim:sw=2:et
#++
#
require 'resolv'
require 'net/http'
require 'cgi'
-require 'iconv'
+
begin
- require 'net/https'
+ require 'nokogiri'
rescue LoadError => e
- error "Couldn't load 'net/https': #{e.pretty_inspect}"
- error "Secured HTTP connections will fail"
+ error "No nokogiri library found, some features might not be available!"
end
# To handle Gzipped pages
module ::Net
class HTTPResponse
attr_accessor :no_cache
- if !instance_methods.include?('raw_body')
+ unless method_defined? :raw_body
alias :raw_body :body
end
ctype = self['content-type'] || 'text/html'
return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i
- charsets = ['latin1'] # should be in config
+ charsets = ['ISO-8859-1'] # should be in config
if ctype.match(/charset=["']?([^\s"']+)["']?/i)
charsets << $1
debug "charset #{charsets.last} added from header"
end
+ # str might be invalid utf-8 that will crash on the pattern match:
+ str.encode!('UTF-8', 'UTF-8', :invalid => :replace)
case str
when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
charsets << $1
charsets = self.body_charset(str) or return str
charsets.reverse_each do |charset|
- # XXX: this one is really ugly, but i don't know how to make it better
- # -jsn
-
- 0.upto(5) do |off|
- begin
- debug "trying #{charset} / offset #{off}"
- return Iconv.iconv('utf-8//ignore',
- charset,
- str.slice(0 .. (-1 - off))).first
- rescue
- debug "conversion failed for #{charset} / offset #{off}"
+ begin
+ debug "try decoding using #{charset}"
+ str.force_encoding(charset)
+ tmp = str.encode('UTF-16le', :invalid => :replace, :replace => '').encode('UTF-8')
+ if tmp
+ str = tmp
+ break
end
+ rescue
+ error 'failed to use encoding'
+ error $!
end
end
+
return str
end
# If we can't unpack the whole stream (e.g. because we're doing a
# partial read
debug "full gunzipping failed (#{e}), trying to recover as much as possible"
- ret = ""
+ ret = ''
+ ret.force_encoding(Encoding::ASCII_8BIT)
begin
Zlib::GzipReader.new(StringIO.new(str)).each_byte { |byte|
ret << byte
end
when 'deflate'
debug "inflating body"
- # From http://www.koders.com/ruby/fid927B4382397E5115AC0ABE21181AB5C1CBDD5C17.aspx?s=thread:
+ # From http://www.koders.com/ruby/fid927B4382397E5115AC0ABE21181AB5C1CBDD5C17.aspx?s=thread:
# -MAX_WBITS stops zlib from looking for a zlib header
inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS)
begin
# TODO
# debug "full inflation failed (#{e}), trying to recover as much as possible"
end
+ when /^(?:iso-8859-\d+|windows-\d+|utf-8|utf8)$/i
+ # B0rked servers (Freshmeat being one of them) sometimes return the charset
+ # in the content-encoding; in this case we assume that the document has
+ # a standard content-encoding
+ old_hsh = self.to_hash
+ self['content-type']= self['content-type']+"; charset="+method.downcase
+ warning "Charset vs content-encoding confusion, trying to recover: from\n#{old_hsh.pretty_inspect}to\n#{self.to_hash.pretty_inspect}"
+ return str
else
+ debug self.to_hash
raise "Unhandled content encoding #{method}"
end
end
return self.body_to_utf(self.decompress_body(partial))
end
+
+ def xpath(path)
+ document = Nokogiri::HTML.parse(self.body)
+ document.xpath(path)
+ end
+
+ def to_json
+ JSON::parse(self.body)
+ end
end
end
-Net::HTTP.version_1_2
-
module ::Irc
module Utils
Bot::Config.register Bot::Config::IntegerValue.new('http.max_cache_time',
:default => 60*24,
:desc => "After how many minutes since first use a cached document is considered to be expired")
- Bot::Config.register Bot::Config::IntegerValue.new('http.no_expire_cache',
+ Bot::Config.register Bot::Config::BooleanValue.new('http.no_expire_cache',
:default => false,
:desc => "Set this to true if you want the bot to never expire the cached pages")
Bot::Config.register Bot::Config::IntegerValue.new('http.info_bytes',
end
end
- h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_port)
+ h = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port, proxy_user, proxy_pass)
h.use_ssl = true if uri.scheme == "https"
h.read_timeout = opts[:read_timeout]
if resp.key?('location')
raise 'Too many redirections' if opts[:max_redir] <= 0
yield resp if opts[:yield] == :all && block_given?
+ # some servers actually provide unescaped location, e.g.
+ # http://ulysses.soup.io/post/60734021/Image%20curve%20ball
+ # redirects to something like
+ # http://ulysses.soup.io/post/60734021/Image curve ball?sessid=8457b2a3752085cca3fb1d79b9965446
+ # causing the URI parser to (obviously) complain. We cannot just
+ # escape blindly, as this would make a mess of already-escaped
+ # locations, so we only do it if the URI.parse fails
loc = resp['location']
- new_loc = URI.join(uri.to_s, loc) rescue URI.parse(loc)
+ escaped = false
+ debug "redirect location: #{loc.inspect}"
+ begin
+ new_loc = URI.join(uri.to_s, loc) rescue URI.parse(loc)
+ rescue
+ if escaped
+ raise $!
+ else
+ loc = URI.escape(loc)
+ escaped = true
+ debug "escaped redirect location: #{loc.inspect}"
+ retry
+ end
+ end
new_opts = opts.dup
new_opts[:max_redir] -= 1
case opts[:method].to_s.downcase.intern
new_opts[:method] = :get
end
if resp['set-cookie']
- debug "setting cookie #{resp['set-cookie']}"
- new_opts[:headers] ||= Hash.new
- new_opts[:headers]['Cookie'] = resp['set-cookie']
+ debug "set cookie request for #{resp['set-cookie']}"
+ cookie, cookie_flags = (resp['set-cookie']+'; ').split('; ', 2)
+ domain = uri.host
+ cookie_flags.scan(/(\S+)=(\S+);/) { |key, val|
+ if key.intern == :domain
+ domain = val
+ break
+ end
+ }
+ debug "cookie domain #{domain} / #{new_loc.host}"
+ if new_loc.host.rindex(domain) == new_loc.host.length - domain.length
+ debug "setting cookie"
+ new_opts[:headers] ||= Hash.new
+ new_opts[:headers]['Cookie'] = cookie
+ else
+ debug "cookie is for another domain, ignoring"
+ end
end
debug "following the redirect to #{new_loc}"
return get_response(new_loc, new_opts, &block)
#
def get_response(uri_or_s, options = {}, &block) # :yields: resp
uri = uri_or_s.kind_of?(URI) ? uri_or_s : URI.parse(uri_or_s.to_s)
+ unless URI::HTTP === uri
+ if uri.scheme
+ raise "#{uri.scheme.inspect} URI scheme is not supported"
+ else
+ raise "don't know what to do with #{uri.to_s.inspect}"
+ end
+ end
+
opts = {
:max_redir => @bot.config['http.max_redir'],
:yield => :final,
:method => :GET
}.merge(options)
- resp = nil
- cached = nil
-
req_class = case opts[:method].to_s.downcase.intern
when :head, :"net::http::head"
opts[:max_redir] = -1
debug "get_response(#{uri}, #{opts.inspect})"
- if opts[:cache] && cached = @cache[cache_key]
+ cached = @cache[cache_key]
+
+ if opts[:cache] && cached
debug "got cached"
if !cached.expired?
debug "using cached"
headers['Range'] = opts[:range] if opts[:range]
headers['Authorization'] = opts[:auth_head] if opts[:auth_head]
- cached.setup_headers(headers) if cached && (req_class == Net::HTTP::Get)
+ if opts[:cache] && cached && (req_class == Net::HTTP::Get)
+ cached.setup_headers headers
+ end
+
req = req_class.new(uri.request_uri, headers)
if uri.user && uri.password
req.basic_auth(uri.user, uri.password)
debug "prepared request: #{req.to_hash.inspect}"
begin
- get_proxy(uri, opts).start do |http|
- http.request(req) do |resp|
- resp['x-rbot-location'] = uri.to_s
- if Net::HTTPNotModified === resp
- debug "not modified"
- begin
- cached.revalidate(resp)
- rescue Exception => e
- error e
+ get_proxy(uri, opts).start do |http|
+ http.request(req) do |resp|
+ resp['x-rbot-location'] = uri.to_s
+ if Net::HTTPNotModified === resp
+ debug "not modified"
+ begin
+ cached.revalidate(resp)
+ rescue Exception => e
+ error e
+ end
+ debug "reusing cached"
+ resp = cached.response
+ elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp
+ debug "http error, deleting cached obj" if cached
+ @cache.delete(cache_key)
end
- debug "reusing cached"
- resp = cached.response
- elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp
- debug "http error, deleting cached obj" if cached
- @cache.delete(cache_key)
- elsif opts[:cache]
+
begin
return handle_response(uri, resp, opts, &block)
ensure
@cache[cache_key] = cached
end
end
- return ret
end
- return handle_response(uri, resp, opts, &block)
end
- end
rescue Exception => e
error e
raise e.message
resp = get_response(uri, options, &block)
raise "http error: #{resp}" unless Net::HTTPOK === resp ||
Net::HTTPPartialContent === resp
- return resp.body
+ if options[:resp]
+ return resp
+ else
+ return resp.body
+ end
rescue Exception => e
error e
end
opts = {:method => :head}.merge(options)
begin
resp = get_response(uri, opts, &block)
- raise "http error #{resp}" if Net::HTTPClientError === resp ||
- Net::HTTPServerError == resp
+ # raise "http error #{resp}" if Net::HTTPClientError === resp ||
+ # Net::HTTPServerError == resp
return resp
rescue Exception => e
error e
opts = {:method => :post, :body => data, :cache => false}.merge(options)
begin
resp = get_response(uri, opts, &block)
- raise 'http error' unless Net::HTTPOK === resp
+ raise 'http error' unless Net::HTTPOK === resp or Net::HTTPCreated === resp
return resp
rescue Exception => e
error e