diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-04-01 16:46:05 +0000 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-04-01 16:46:05 +0000 |
commit | 8d51c4a1a5a75e8e660f85cce37efcdf993500af (patch) | |
tree | 8686a777529508c92fdcc16bde45b6daf5d2fe94 /lib/rbot | |
parent | 84e53ad77fae1fd7e924986c5f36a04115e13ffc (diff) |
String#utfy_xml() method that tries to transcode a webpage to UTF-8; HTTP headers are attached to bodies returned by HttpUtil methods to ease charset detection
Diffstat (limited to 'lib/rbot')
-rw-r--r-- | lib/rbot/core/utils/extends.rb | 43 | ||||
-rw-r--r-- | lib/rbot/core/utils/httputil.rb | 16 | ||||
-rw-r--r-- | lib/rbot/core/utils/utils.rb | 2 |
3 files changed, 60 insertions, 1 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb index 0ecf7aa2..95569b71 100644 --- a/lib/rbot/core/utils/extends.rb +++ b/lib/rbot/core/utils/extends.rb @@ -27,6 +27,12 @@ class ::Array end end +begin + require 'iconv' + $we_have_iconv = true +rescue LoadError + $we_have_iconv = false +end # Extensions to the String class # @@ -35,6 +41,43 @@ end # class ::String + # This method will try to transcode a String supposed to hold an XML or HTML + # document from the original charset to UTF-8. + # + # To find the original encoding, it will first see if the String responds to + # #http_headers(), and if it does it will assume that the charset indicated + # there is the correct one. Otherwise, it will try to detect the charset from + # some typical XML and HTML headers + def utfy_xml + return self unless $we_have_iconv + + charset = nil + + if self.respond_to?(:http_headers) and headers = self.http_headers + if headers['content-type'].first.match(/charset="?(\S+?)"?\s*;?/i) + charset = $1 + end + end + + if not charset + case self + when /<\?xml.*encoding="(\S+)".*\?>/i + charset = $1 + when /<meta\s+http-equiv\s*=\s*"Content-Type".*charset\s*=\s*"?(\S+?)"?\s*;?/i + charset = $1 + end + end + + if charset + debug "charset: #{charset}" + return Iconv.iconv('utf-8', charset, self).join rescue self + else + debug "Couldn't find charset for #{self.inspect}" + return self + end + + end + # This method will return a purified version of the receiver, with all HTML # stripped off and some of it converted to IRC formatting # diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb index 78445abe..78ea9063 100644 --- a/lib/rbot/core/utils/httputil.rb +++ b/lib/rbot/core/utils/httputil.rb @@ -301,6 +301,22 @@ class HttpUtil resp.body end + class << resp.body + def http_headers + if defined?(@http_headers) + @http_headers + else + nil + end + end + + def http_headers=(rsp) + @http_headers=rsp + end + end + + resp.body.http_headers = 
resp.to_hash + return resp end diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 717630e3..63cd58da 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -433,7 +433,7 @@ module ::Irc # * :min_spaces => Minimum number of spaces a paragraph should have # def Utils.ircify_first_html_par(xml_org, opts={}) - xml = xml_org.gsub(/<!--.*?-->/, '') + xml = xml_org.gsub(/<!--.*?-->/, '').utfy_xml strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) |