chucknorris: fix loading

[user/henk/code/ruby/rbot.git] / lib / rbot / core / utils / utils.rb
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index c47601c1baf89386d0801fb5d5458670866e7e24..951d0513ddb22974e97580d1b28465abde852998 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -1,3 +1,4 @@
+# encoding: UTF-8
  #-- vim:sw=2:et
  #++
  #
@@ -23,6 +24,7 @@ rescue LoadError
      'raquo' => '»',
      'quot' => '"',
      'apos' => '\'',
+    'deg' => '°',
      'micro' => 'µ',
      'copy' => '©',
      'trade' => '™',
@@ -32,6 +34,7 @@ rescue LoadError
      'gt' => '>',
      'hellip' => '…',
      'nbsp' => ' ',
+    'ndash' => '–',
      'Agrave' => 'À',
      'Aacute' => 'Á',
      'Acirc' => 'Â',
@@ -125,7 +128,7 @@ rescue LoadError
  
          # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
          # to mark actual text
-        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
  
          # At worst, we can try stuff which is comprised between two <br>
          AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
@@ -137,22 +140,6 @@ module ::Irc
  
    # Miscellaneous useful functions
    module Utils
-    @@bot = nil unless defined? @@bot
-    @@safe_save_dir = nil unless defined?(@@safe_save_dir)
-
-    # The bot instance
-    def Utils.bot
-      @@bot
-    end
-
-    # Set up some Utils routines which depend on the associated bot.
-    def Utils.bot=(b)
-      debug "initializing utils"
-      @@bot = b
-      @@safe_save_dir = @@bot.path('safe_save')
-    end
-
-
      # Seconds per minute
      SEC_PER_MIN = 60
      # Seconds per hour
@@ -196,7 +183,7 @@ module ::Irc
        when 0
          raise "Empty ret array!"
        when 1
-        return ret.to_s
+        return ret[0].to_s
        else
          return [ret[0, ret.length-1].join(", ") , ret[-1]].join(_(" and "))
        end
@@ -275,52 +262,73 @@ module ::Irc
      # Execute an external program, returning a String obtained by redirecting
      # the program's standard error and output
      #
+    # TODO: find a way to expose some common errors (e.g. Errno::NOENT)
+    # to the caller
      def Utils.safe_exec(command, *args)
-      IO.popen("-") { |p|
+      output = IO.popen("-") { |p|
          if p
-          return p.readlines.join("\n")
+          break p.readlines.join("\n")
          else
            begin
              $stderr.reopen($stdout)
              exec(command, *args)
            rescue Exception => e
-            puts "exec of #{command} led to exception: #{e.pretty_inspect}"
-            Kernel::exit! 0
+            puts "exception #{e.pretty_inspect} trying to run #{command}"
+            Kernel::exit! 1
            end
            puts "exec of #{command} failed"
-          Kernel::exit! 0
+          Kernel::exit! 1
          end
        }
+      raise "safe execution of #{command} returned #{$?}" unless $?.success?
+      return output
      end
  
-
-    # Safely (atomically) save to _file_, by passing a tempfile to the block
-    # and then moving the tempfile to its final location when done.
-    #
-    # call-seq: Utils.safe_save(file, &block)
-    #
-    def Utils.safe_save(file)
-      raise 'No safe save directory defined!' if @@safe_save_dir.nil?
-      basename = File.basename(file)
-      temp = Tempfile.new(basename,@@safe_save_dir)
-      temp.binmode
-      yield temp if block_given?
-      temp.close
-      File.rename(temp.path, file)
+    # Try executing an external program, returning true if the run was successful
+    # and false otherwise
+    def Utils.try_exec(command, *args)
+      IO.popen("-") { |p|
+        if p.nil?
+          begin
+            $stderr.reopen($stdout)
+            exec(command, *args)
+          rescue Exception => e
+            Kernel::exit! 1
+          end
+          Kernel::exit! 1
+        else
+          debug p.readlines
+        end
+      }
+      debug $?
+      return $?.success?
      end
  
  
      # Decode HTML entities in the String _str_, using HTMLEntities if the
      # package was found, or UNESCAPE_TABLE otherwise.
      #
-    def Utils.decode_html_entities(str)
-      if defined? ::HTMLEntities
-        return HTMLEntities.decode_entities(str)
+
+    if defined? ::HTMLEntities
+      if ::HTMLEntities.respond_to? :decode_entities
+        def Utils.decode_html_entities(str)
+          return HTMLEntities.decode_entities(str)
+        end
        else
-        str.gsub(/(&(.+?);)/) {
+        @@html_entities = HTMLEntities.new
+        def Utils.decode_html_entities(str)
+          return @@html_entities.decode str
+        end
+      end
+    else
+      def Utils.decode_html_entities(str)
+        return str.gsub(/(&(.+?);)/) {
            symbol = $2
          # remove the 0-padding from unicode integers
-          if symbol =~ /^#(\d+)$/
+          case symbol
+          when /^#x([0-9a-fA-F]+)$/
+            symbol = $1.to_i(16).to_s
+          when /^#(\d+)$/
              symbol = $1.to_i.to_s
            end
  
@@ -458,7 +466,11 @@ module ::Irc
  
      # HTML first par grabber without hpricot
      def Utils.ircify_first_html_par_woh(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+      xml = xml_org.gsub(/<!--.*?-->/m,
+                         "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+                         "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+                         "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+                         "")
  
        strip = opts[:strip]
        strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -546,16 +558,16 @@ module ::Irc
      # information is retrieved, and special title/summary
      # extraction routines are used if possible.
      #
-    def Utils.get_html_info(doc, opts={})
+    def Utils.get_html_info(bot, doc, opts={})
        case doc
        when String
          Utils.get_string_html_info(doc, opts)
        when Net::HTTPResponse
-        Utils.get_resp_html_info(doc, opts)
+        Utils.get_resp_html_info(bot, doc, opts)
        when URI
          ret = DataStream.new
-        @@bot.httputil.get_response(doc) { |resp|
-          ret.replace Utils.get_resp_html_info(resp, opts)
+        bot.httputil.get_response(doc) { |resp|
+          ret.replace Utils.get_resp_html_info(bot, resp, opts)
          }
          return ret
        else
@@ -572,7 +584,7 @@ module ::Irc
      # Currently, the only accepted options (in _opts_) are
      # uri_fragment:: the URI fragment of the original request
      # full_body::    get the whole body instead of
-    #                @@bot.config['http.info_bytes'] bytes only
+    #                bot.config['http.info_bytes'] bytes only
      #
      # Returns a DataStream with the following keys:
      # text:: the (partial) body
@@ -583,7 +595,7 @@ module ::Irc
      #   a Hash whose keys are lowercase forms of the HTTP
      #   header fields, and whose values are Arrays.
      #
-    def Utils.get_resp_html_info(resp, opts={})
+    def Utils.get_resp_html_info(bot, resp, opts={})
        case resp
        when Net::HTTPSuccess
          loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
@@ -592,9 +604,9 @@ module ::Irc
          end
          ret = DataStream.new(opts.dup)
          ret[:headers] = resp.to_hash
-        ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes'])
+        ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(bot.config['http.info_bytes'])
  
-        filtered = Utils.try_htmlinfo_filters(ret)
+        filtered = Utils.try_htmlinfo_filters(bot, ret)
  
          if filtered
            return filtered
@@ -615,14 +627,14 @@ module ::Irc
      # The input DataStream should have the downloaded HTML as primary key
      # (:text) and possibly a :headers key holding the response headers.
      #
-    def Utils.try_htmlinfo_filters(ds)
-      filters = @@bot.filter_names(:htmlinfo)
+    def Utils.try_htmlinfo_filters(bot, ds)
+      filters = bot.filter_names(:htmlinfo)
        return nil if filters.empty?
        cur = nil
        # TODO filter priority
        filters.each { |n|
          debug "testing htmlinfo filter #{n}"
-        cur = @@bot.filter(@@bot.global_filter_name(n, :htmlinfo), ds)
+        cur = bot.filter(bot.global_filter_name(n, :htmlinfo), ds)
          debug "returned #{cur.pretty_inspect}"
          break if cur
        }
@@ -677,7 +689,7 @@ module ::Irc
      # If (optional) _opts_ :message is specified, those paragraphs are
      # echoed as replies to the IRC message passed as _opts_ :message
      #
-    def Utils.get_first_pars(urls, count, opts={})
+    def Utils.get_first_pars(bot, urls, count, opts={})
        idx = 0
        msg = opts[:message]
        retval = Array.new
@@ -686,7 +698,7 @@ module ::Irc
          idx += 1
  
          begin
-          info = Utils.get_html_info(URI.parse(url), opts)
+          info = Utils.get_html_info(bot, URI.parse(url), opts)
  
            par = info[:content]
            retval.push(par)
@@ -719,5 +731,3 @@ module ::Irc
  
    end
  end
-
-Irc::Utils.bot = Irc::Bot::Plugins.manager.bot