X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Futils.rb;h=417622e43ea1cf65e13235971d66a228b4f4bcd6;hb=c7c670947b9ec9129412e05fc7934531c9d132ba;hp=4105fa12db611c0988d5f91d0757c458d44e2168;hpb=87d2affc0fbe7cf864fdad4eca506d9d06a0de0c;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git

diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 4105fa12..417622e4 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
 #-- vim:sw=2:et
 #++
 #
@@ -23,6 +24,7 @@ rescue LoadError
     'raquo' => 'Â»',
     'quot' => '"',
     'apos' => '\'',
+    'deg' => 'Â°',
     'micro' => 'Âµ',
     'copy' => 'Â©',
     'trade' => 'â¢',
@@ -32,6 +34,7 @@ rescue LoadError
     'gt' => '>',
     'hellip' => 'â¦',
     'nbsp' => 'Â ',
+    'ndash' => 'â',
     'Agrave' => 'Ã',
     'Aacute' => 'Ã',
     'Acirc' => 'Ã',
@@ -125,7 +128,7 @@ rescue LoadError
 
         # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
         # to mark actual text
-        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 
         # At worst, we can try stuff which is comprised between two <br>
         AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
@@ -196,7 +199,7 @@ module ::Irc
       when 0
         raise "Empty ret array!"
       when 1
-        return ret.to_s
+        return ret[0].to_s
       else
         return [ret[0, ret.length-1].join(", ") , ret[-1]].join(_(" and "))
       end
@@ -336,14 +339,27 @@ module ::Irc
     # Decode HTML entities in the String _str_, using HTMLEntities if the
     # package was found, or UNESCAPE_TABLE otherwise.
     #
-    def Utils.decode_html_entities(str)
-      if defined? ::HTMLEntities
-        return HTMLEntities.decode_entities(str)
+
+    if defined? ::HTMLEntities
+      if ::HTMLEntities.respond_to? :decode_entities
+        def Utils.decode_html_entities(str)
+          return HTMLEntities.decode_entities(str)
+        end
       else
-        str.gsub(/(&(.+?);)/) {
+        @@html_entities = HTMLEntities.new
+        def Utils.decode_html_entities(str)
+          return @@html_entities.decode str
+        end
+      end
+    else
+      def Utils.decode_html_entities(str)
+        return str.gsub(/(&(.+?);)/) {
           symbol = $2
           # remove the 0-paddng from unicode integers
-          if symbol =~ /^#(\d+)$/
+          case symbol
+          when /^#x([0-9a-fA-F]+)$/
+            symbol = $1.to_i(16).to_s
+          when /^#(\d+)$/
             symbol = $1.to_i.to_s
           end
 
@@ -481,7 +497,11 @@ module ::Irc
 
     # HTML first par grabber without hpricot
     def Utils.ircify_first_html_par_woh(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+      xml = xml_org.gsub(/<!--.*?-->/m,
+                         "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+                         "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+                         "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+                         "")
 
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)