end
end
-begin
- require 'iconv'
- $we_have_iconv = true
-rescue LoadError
- $we_have_iconv = false
-end
-
# Extensions to the String class
#
# TODO make ircify_html() accept an Hash of options, and make riphtml() just
#
class ::String
- # This method will try to transcode a String supposed to hold an XML or HTML
- # document from the original charset to UTF-8.
- #
- # To find the original encoding, it will first see if the String responds to
- # #http_headers(), and if it does it will assume that the charset indicated
- # there is the correct one. Otherwise, it will try to detect the charset from
- # some typical XML and HTML headers
- def utfy_xml
- return self unless $we_have_iconv
-
- charset = nil
-
- if self.respond_to?(:http_headers) and headers = self.http_headers
- if headers['content-type'].first.match(/charset=(\S+?)\s*(?:;|\Z)/i)
- debug "charset #{charset} set from header"
- charset = $1
- end
- end
-
- if not charset
- case self
- when /<\?xml.*encoding="(\S+)".*\?>/i
- charset = $1
- when /<meta\s+http-equiv\s*=\s*["']?Content-Type["']?.*charset\s*=\s*(\S+?)(?:;|["']|\s).*>/i
- charset = $1
- end
- debug "charset #{charset} set from string"
- end
-
- if charset
- return Iconv.iconv('utf-8', charset, self).join rescue self
- else
- debug "Couldn't find charset for #{self.inspect}"
- return self
- end
-
- end
-
# This method will return a purified version of the receiver, with all HTML
# stripped off and some of it converted to IRC formatting
#
require 'resolv'
require 'net/http'
+require 'iconv'
begin
require 'net/https'
rescue LoadError => e
module ::Net
class HTTPResponse
+ attr_accessor :no_cache
+ if !instance_methods.include?('raw_body')
+ alias :raw_body :body
+ end
+
+ def body_charset(str=self.raw_body)
+ ctype = self['content-type'] || 'text/html'
+ return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i
+
+ charset = 'latin1' # should be in config
+
+ if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i)
+ charset = $1
+ debug "charset #{charset} set from header"
+ end
+
+ case str
+ when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
+ charset = $1
+ debug "xml charset #{charset} set from xml pi"
+ when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i
+ meta = $1
+ if meta =~ /charset=['"]?([^\s'";]+)['"]?/
+ charset = $1
+ debug "html charset #{charset} set from meta"
+ end
+ end
+ return charset
+ end
+
+ def body_to_utf(str)
+ charset = self.body_charset(str) or return str
+
+ begin
+ return Iconv.iconv('utf-8//ignore', charset, str).first
+ rescue
+ debug "conversion failed"
+ return str
+ end
+ end
+
+ def body
+ return self.body_to_utf(self.raw_body)
+ end
+
# Read chunks from the body until we have at least _size_ bytes, yielding
# the partial text at each chunk. Return the partial body.
def partial_body(size=0, &block)
+ self.no_cache = true # maybe_new returns nil for responses flagged this way
partial = String.new
self.read_body { |chunk|
partial << chunk
- yield partial if block_given?
+ yield self.body_to_utf(partial) if block_given? # the block gets the text accumulated so far, run through body_to_utf
break if size and size > 0 and partial.length >= size
}
- return partial
+ return self.body_to_utf(partial) # the returned partial body goes through body_to_utf as well
end
end
end
def self.maybe_new(resp)
debug "maybe new #{resp}"
+ return nil if resp.no_cache
return nil unless Net::HTTPOK === resp ||
Net::HTTPMovedPermanently === resp ||
Net::HTTPFound === resp ||
@response = resp
begin
self.revalidate
- self.response.body
+ self.response.raw_body
rescue Exception => e
error e.message
error e.backtrace.join("\n")
if block_given?
yield(resp)
else
- resp.body
+ # Net::HTTP wants us to read the whole body here
+ resp.raw_body
end
-
- class << resp.body
- def http_headers
- if defined?(@http_headers)
- @http_headers
- else
- nil
- end
- end
-
- def http_headers=(rsp)
- @http_headers=rsp
- end
- end
-
- resp.body.http_headers = resp.to_hash
-
return resp
end
elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp
debug "http error, deleting cached obj" if cached
@cache.delete(cache_key)
- elsif opts[:cache] && cached = CachedObject.maybe_new(resp) rescue nil
- debug "storing to cache"
- @cache[cache_key] = cached
+ elsif opts[:cache]
+ begin
+ return handle_response(uri, resp, opts, &block)
+ ensure
+ if cached = CachedObject.maybe_new(resp) rescue nil
+ debug "storing to cache"
+ @cache[cache_key] = cached
+ end
+ end
+ return ret
end
return handle_response(uri, resp, opts, &block)
end
# * :min_spaces => Minimum number of spaces a paragraph should have
#
def Utils.ircify_first_html_par(xml_org, opts={})
- xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "").utfy_xml
+ xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)