String#utfy_xml() method that tries to transcode a webpage to UTF-8; HTTP headers are attached to bodies returned by HttpUtil methods to ease charset detection

author: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-04-01 16:46:05 +0000
committer: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-04-01 16:46:05 +0000
commit: 8d51c4a1a5a75e8e660f85cce37efcdf993500af (patch)
tree: 8686a777529508c92fdcc16bde45b6daf5d2fe94 /lib/rbot
parent: 84e53ad77fae1fd7e924986c5f36a04115e13ffc (diff)
3 files changed, 60 insertions, 1 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index 0ecf7aa2..95569b71 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -27,6 +27,12 @@ class ::Array
   end
 end
 
+$we_have_iconv = begin   # true only when the optional iconv library loads
+  require 'iconv'
+  true
+rescue LoadError         # iconv is missing: remember that instead of failing
+  false
+end
 
 # Extensions to the String class
 #
@@ -35,6 +41,43 @@ end
 #
 class ::String
 
+  # This method will try to transcode a String supposed to hold an XML or HTML
+  # document from the original charset to UTF-8. The receiver is returned
+  # unchanged if iconv is unavailable, no charset is found or iconv fails.
+  #
+  # To find the original encoding, it will first see if the String responds to
+  # #http_headers(), and use the charset of the Content-Type header if any.
+  # Otherwise, it will try to detect it from some typical XML and HTML headers
+  def utfy_xml
+    return self unless $we_have_iconv
+
+    charset = nil
+    if self.respond_to?(:http_headers) and headers = self.http_headers
+      # The Content-Type header may be missing: never call #first on nil
+      ctype = Array(headers['content-type']).first.to_s
+      # Capture the whole charset token; a lazy (\S+?) followed only by
+      # optional parts would stop after the first character
+      charset = $1 if ctype =~ /charset\s*=\s*["']?([^\s"';]+)/i
+    end
+
+    if not charset
+      case self
+      when /<\?xml.*encoding=["']([^\s"']+)["'].*\?>/i
+        charset = $1
+      when /<meta\s+http-equiv\s*=\s*"Content-Type".*charset\s*=\s*["']?([^\s"';>]+)/i
+        charset = $1
+      end
+    end
+
+    if charset
+      debug "charset: #{charset}"
+      return Iconv.iconv('utf-8', charset, self).join rescue self
+    else
+      debug "Couldn't find charset for #{self.inspect}"
+      return self
+    end
+  end
+
   # This method will return a purified version of the receiver, with all HTML
   # stripped off and some of it converted to IRC formatting
   #
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index 78445abe..78ea9063 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -301,6 +301,22 @@ class HttpUtil
       resp.body
     end
 
+    class << resp.body
+      # Headers attached to this body, or nil if none were attached.
+      def http_headers
+        defined?(@http_headers) ? @http_headers : nil
+      end
+
+      # Attach +rsp+ (the response headers) to this body.
+      def http_headers=(rsp)
+        @http_headers = rsp
+      end
+    end
+
+    # Expose the response headers through the body, so that code handed only
+    # the body (e.g. String#utfy_xml) can use them for charset detection.
+    resp.body.http_headers = resp.to_hash
+
     return resp
   end
 
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 717630e3..63cd58da 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -433,7 +433,7 @@ module ::Irc
     #   * :min_spaces => Minimum number of spaces a paragraph should have
     #
     def Utils.ircify_first_html_par(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/, '')
+      xml = xml_org.gsub(/<!--.*?-->/, '').utfy_xml
 
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
author	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-04-01 16:46:05 +0000
committer	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-04-01 16:46:05 +0000
commit	8d51c4a1a5a75e8e660f85cce37efcdf993500af (patch)
tree	8686a777529508c92fdcc16bde45b6daf5d2fe94 /lib/rbot
parent	84e53ad77fae1fd7e924986c5f36a04115e13ffc (diff)