utils: support hex HTML entities

[user/henk/code/ruby/rbot.git] / lib / rbot / core / utils / utils.rb
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index 4cd0dceb08d67a363f201690f8d2436d1140f989..7b316ffe28cd3df6ffc6ecd0d1707063142dd1bb 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -16,15 +16,6 @@ require 'set'
  begin
    require 'htmlentities'
  rescue LoadError
-  gems = nil
-  begin
-    gems = require 'rubygems'
-  rescue LoadError
-    gems = false
-  end
-  if gems
-    retry
-  else
      module ::Irc
        module Utils
          UNESCAPE_TABLE = {
@@ -32,6 +23,7 @@ rescue LoadError
      'raquo' => '»',
      'quot' => '"',
      'apos' => '\'',
+    'deg' => '°',
      'micro' => 'µ',
      'copy' => '©',
      'trade' => '™',
@@ -41,6 +33,7 @@ rescue LoadError
      'gt' => '>',
      'hellip' => '…',
      'nbsp' => ' ',
+    'ndash' => '–',
      'Agrave' => 'À',
      'Aacute' => 'Á',
      'Acirc' => 'Â',
@@ -108,7 +101,6 @@ rescue LoadError
          }
        end
      end
-  end
  end
  
  begin
@@ -121,15 +113,6 @@ begin
      end
    end
  rescue LoadError
-  gems = nil
-  begin
-    gems = require 'rubygems'
-  rescue LoadError
-    gems = false
-  end
-  if gems
-    retry
-  else
      module ::Irc
        module Utils
          # Some regular expressions to manage HTML data
@@ -144,13 +127,12 @@ rescue LoadError
  
          # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
          # to mark actual text
-        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
  
          # At worst, we can try stuff which is comprised between two <br>
          AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
        end
      end
-  end
  end
  
  module ::Irc
@@ -169,7 +151,7 @@ module ::Irc
      def Utils.bot=(b)
        debug "initializing utils"
        @@bot = b
-      @@safe_save_dir = "#{@@bot.botclass}/safe_save"
+      @@safe_save_dir = @@bot.path('safe_save')
      end
  
  
@@ -287,32 +269,55 @@ module ::Irc
          _("%{m} minutes") % { :m => secs/SEC_PER_MIN }
        when secs > 1
          _("%{m} seconds") % { :m => secs }
-      else 
+      else
          _("one second")
        end
      end
  
      # Execute an external program, returning a String obtained by redirecting
-    # the program's standards errors and output 
+    # the program's standard error and standard output
      #
+    # TODO: find a way to expose some common errors (e.g. Errno::ENOENT)
+    # to the caller
      def Utils.safe_exec(command, *args)
-      IO.popen("-") { |p|
+      output = IO.popen("-") { |p|
          if p
-          return p.readlines.join("\n")
+          break p.readlines.join("\n")
          else
            begin
              $stderr.reopen($stdout)
              exec(command, *args)
            rescue Exception => e
-            puts "exec of #{command} led to exception: #{e.pretty_inspect}"
-            Kernel::exit! 0
+            puts "exception #{e.pretty_inspect} trying to run #{command}"
+            Kernel::exit! 1
            end
            puts "exec of #{command} failed"
-          Kernel::exit! 0
+          Kernel::exit! 1
          end
        }
+      raise "safe execution of #{command} returned #{$?}" unless $?.success?
+      return output
      end
  
+    # Try executing an external program, returning true if the run was successful
+    # and false otherwise
+    def Utils.try_exec(command, *args)
+      IO.popen("-") { |p|
+        if p.nil?
+          begin
+            $stderr.reopen($stdout)
+            exec(command, *args)
+          rescue Exception => e
+            Kernel::exit! 1
+          end
+          Kernel::exit! 1
+        else
+          debug p.readlines
+        end
+      }
+      debug $?
+      return $?.success?
+    end
  
      # Safely (atomically) save to _file_, by passing a tempfile to the block
      # and then moving the tempfile to its final location when done.
@@ -333,14 +338,27 @@ module ::Irc
      # Decode HTML entities in the String _str_, using HTMLEntities if the
      # package was found, or UNESCAPE_TABLE otherwise.
      #
-    def Utils.decode_html_entities(str)
-      if defined? ::HTMLEntities
-        return HTMLEntities.decode_entities(str)
+
+    if defined? ::HTMLEntities
+      if ::HTMLEntities.respond_to? :decode_entities
+        def Utils.decode_html_entities(str)
+          return HTMLEntities.decode_entities(str)
+        end
        else
-        str.gsub(/(&(.+?);)/) {
+        @@html_entities = HTMLEntities.new
+        def Utils.decode_html_entities(str)
+          return @@html_entities.decode str
+        end
+      end
+    else
+      def Utils.decode_html_entities(str)
+        return str.gsub(/(&(.+?);)/) {
            symbol = $2
            # remove the 0-paddng from unicode integers
-          if symbol =~ /^#(\d+)$/
+          case symbol
+          when /^#x([0-9a-fA-F]+)$/
+            symbol = $1.to_i(16).to_s
+          when /^#(\d+)$/
              symbol = $1.to_i.to_s
            end
  
@@ -478,7 +496,11 @@ module ::Irc
  
      # HTML first par grabber without hpricot
      def Utils.ircify_first_html_par_woh(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+      xml = xml_org.gsub(/<!--.*?-->/m,
+                         "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+                         "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+                         "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+                         "")
  
        strip = opts[:strip]
        strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -632,7 +654,7 @@ module ::Irc
      # returns non-nil, its results are merged in _ds_ and returned. Otherwise
      # nil is returned.
      #
-    # The input DataStream shuold have the downloaded HTML as primary key
+    # The input DataStream should have the downloaded HTML as primary key
      # (:text) and possibly a :headers key holding the resonse headers.
      #
      def Utils.try_htmlinfo_filters(ds)
@@ -641,7 +663,7 @@ module ::Irc
        cur = nil
        # TODO filter priority
        filters.each { |n|
-        debug "testing filter #{n}"
+        debug "testing htmlinfo filter #{n}"
          cur = @@bot.filter(@@bot.global_filter_name(n, :htmlinfo), ds)
          debug "returned #{cur.pretty_inspect}"
          break if cur
@@ -723,6 +745,20 @@ module ::Irc
        return retval
      end
  
+    # Returns the elements of _words_ joined by commas, except for the last
+    # element, which is joined in with the specified conjunction
+    #
+    def Utils.comma_list(words, options={})
+      defaults = { :join_with => ", ", :join_last_with => _(" and ") }
+      opts = defaults.merge(options)
+
+      if words.size < 2
+        words.last
+      else
+        [words[0..-2].join(opts[:join_with]), words.last].join(opts[:join_last_with])
+      end
+    end
+
    end
  end