diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-04-05 20:24:23 +0000 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-04-05 20:24:23 +0000 |
commit | 4e3660831d7f4fbfe58341e9ce95bef620f13d6b (patch) | |
tree | f4565647c778c5cf2d779ebe10bdb8c6aabe7340 | |
parent | 56889ab64a8b60dba2a4aaaf876386841213b14a (diff) |
HttpUtil: try all detected charsets when converting a webpage, until one that works is found
-rw-r--r-- | lib/rbot/core/utils/httputil.rb | 32 |
1 files changed, 17 insertions, 15 deletions
```diff
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index f0a09364..476a71c1 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -32,36 +32,38 @@ module ::Net
       ctype = self['content-type'] || 'text/html'
       return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i
 
-      charset = 'latin1' # should be in config
+      charsets = ['latin1'] # should be in config
 
       if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i)
-        charset = $1
-        debug "charset #{charset} set from header"
+        charsets << $1
+        debug "charset #{charsets.last} added from header"
       end
 
       case str
       when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
-        charset = $1
-        debug "xml charset #{charset} set from xml pi"
+        charsets << $1
+        debug "xml charset #{charsets.last} added from xml pi"
       when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i
         meta = $1
         if meta =~ /charset=['"]?([^\s'";]+)['"]?/
-          charset = $1
-          debug "html charset #{charset} set from meta"
+          charsets << $1
+          debug "html charset #{charsets.last} added from meta"
         end
       end
-      return charset
+      return charsets.uniq
     end
 
     def body_to_utf(str)
-      charset = self.body_charset(str) or return str
+      charsets = self.body_charset(str) or return str
 
-      begin
-        return Iconv.iconv('utf-8//ignore', charset, str).first
-      rescue
-        debug "conversion failed"
-        return str
-      end
+      charsets.reverse_each { |charset|
+        begin
+          return Iconv.iconv('utf-8//ignore', charset, str).first
+        rescue
+          debug "conversion failed for #{charset}"
+        end
+      }
+      return str
     end
 
     def body
```