Module#define_structure method: define a new Struct only if it doesn't exist already...

[user/henk/code/ruby/rbot.git] / lib / rbot / core / utils / utils.rb
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index 047b29d69e7e80b3d48ed74805988409ea1c55de..23d50c314de645b95d19096833b38f3bf2fba542 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -12,15 +12,20 @@
  # TODO some of these Utils should be rewritten as extensions to the approriate
  # standard Ruby classes and accordingly be moved to extends.rb
  
-require 'net/http'
-require 'uri'
  require 'tempfile'
+require 'set'
  
  begin
    require 'htmlentities'
    $we_have_html_entities_decoder = true
  rescue LoadError
-  if require 'rubygems' rescue false
+  gems = nil
+  begin
+    gems = require 'rubygems'
+  rescue LoadError
+    gems = false
+  end
+  if gems
      retry
    else
      $we_have_html_entities_decoder = false
@@ -304,6 +309,20 @@ module ::Irc
  
    # miscellaneous useful functions
    module Utils
+    @@bot = nil unless defined? @@bot
+    @@safe_save_dir = nil unless defined?(@@safe_save_dir)
+
+    def Utils.bot
+      @@bot
+    end
+
+    def Utils.bot=(b)
+      debug "initializing utils"
+      @@bot = b
+      @@safe_save_dir = "#{@@bot.botclass}/safe_save"
+    end
+
+
      SEC_PER_MIN = 60
      SEC_PER_HR = SEC_PER_MIN * 60
      SEC_PER_DAY = SEC_PER_HR * 24
@@ -352,10 +371,10 @@ module ::Irc
            return p.readlines.join("\n")
          else
            begin
-            $stderr = $stdout
+            $stderr.reopen($stdout)
              exec(command, *args)
            rescue Exception => e
-            puts "exec of #{command} led to exception: #{e.inspect}"
+            puts "exec of #{command} led to exception: #{e.pretty_inspect}"
              Kernel::exit! 0
            end
            puts "exec of #{command} failed"
@@ -365,11 +384,6 @@ module ::Irc
      end
  
  
-    @@safe_save_dir = nil unless defined?(@@safe_save_dir)
-    def Utils.set_safe_save_dir(str)
-      @@safe_save_dir = str.dup
-    end
-
      def Utils.safe_save(file)
        raise 'No safe save directory defined!' if @@safe_save_dir.nil?
        basename = File.basename(file)
@@ -381,44 +395,6 @@ module ::Irc
      end
  
  
-    # returns a string containing the result of an HTTP GET on the uri
-    def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
-
-      # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
-      Net::HTTP.version_1_2
-      # (so we support the 1_1 api anyway, avoids problems)
-
-      uri = URI.parse uristr
-      query = uri.path
-      if uri.query
-        query += "?#{uri.query}"
-      end
-
-      proxy_host = nil
-      proxy_port = nil
-      if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
-        proxy_host = proxy_uri.host
-        proxy_port = proxy_uri.port
-      end
-
-      begin
-        http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
-        http.open_timeout = opentimeout
-        http.read_timeout = readtimeout
-
-        http.start {|http|
-          resp = http.get(query)
-          if resp.code == "200"
-            return resp.body
-          end
-        }
-      rescue => e
-        # cheesy for now
-        error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
-        return nil
-      end
-    end
-
      def Utils.decode_html_entities(str)
        if $we_have_html_entities_decoder
          return HTMLEntities.decode_entities(str)
@@ -439,9 +415,12 @@ module ::Irc
      HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
      PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
  
-    # Some blogging and forum platforms use spans or divs with a 'body' in their class
+    # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
      # to mark actual text
-    AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+    # At worst, we can try the text enclosed between a <br> and the next <br> or structural tag
+    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
  
      # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
      # If possible, grab the one after the first heading
@@ -452,8 +431,8 @@ module ::Irc
      #               text
      #   * :min_spaces => Minimum number of spaces a paragraph should have
      #
-    def Utils.ircify_first_html_par(xml, opts={})
-      txt = String.new
+    def Utils.ircify_first_html_par(xml_org, opts={})
+      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
  
        strip = opts[:strip]
        strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -461,6 +440,8 @@ module ::Irc
        min_spaces = opts[:min_spaces] || 8
        min_spaces = 0 if min_spaces < 0
  
+      txt = String.new
+
        while true
          debug "Minimum number of spaces: #{min_spaces}"
          header_found = xml.match(HX_REGEX)
@@ -495,6 +476,8 @@ module ::Irc
          # Nothing yet ... let's get drastic: we look for non-par elements too,
          # but only for those that match something that we know is likely to
          # contain text
+
+        # Attempt #1
          header_found = xml
          while txt.empty? or txt.count(" ") < min_spaces
            candidate = header_found[AFTER_PAR1_REGEX]
@@ -502,7 +485,20 @@ module ::Irc
            txt = candidate.ircify_html
            header_found = $'
            txt.sub!(strip, '') if strip
-          debug "(other attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+          debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
+        end
+
+        return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+        # Attempt #2
+        header_found = xml
+        while txt.empty? or txt.count(" ") < min_spaces
+          candidate = header_found[AFTER_PAR2_REGEX]
+          break unless candidate
+          txt = candidate.ircify_html
+          header_found = $'
+          txt.sub!(strip, '') if strip
+          debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
          end
  
          debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
@@ -512,19 +508,22 @@ module ::Irc
      end
  
      # Get the first pars of the first _count_ _urls_.
-    # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
-    # and echoed as replies to the IRC message passed as _opts_ :message.
+    # The pages are downloaded using the bot httputil service.
+    # Returns an array of the first paragraphs fetched.
+    # If (optional) _opts_ :message is specified, those paragraphs are
+    # echoed as replies to the IRC message passed as _opts_ :message
      #
      def Utils.get_first_pars(urls, count, opts={})
        idx = 0
        msg = opts[:message]
+      retval = Array.new
        while count > 0 and urls.length > 0
          url = urls.shift
          idx += 1
  
          # FIXME what happens if some big file is returned? We should share
          # code with the url plugin to only retrieve partial file content!
-        xml = opts[:http_util].get_cached(url)
+        xml = self.bot.httputil.get(url)
          if xml.nil?
            debug "Unable to retrieve #{url}"
            next
@@ -535,13 +534,19 @@ module ::Irc
            # FIXME only do this if the 'url' plugin is loaded
            # TODO even better, put the code here
            # par = @bot.plugins['url'].get_title_from_html(xml)
-          next if par.empty?
+          if par.empty?
+            retval.push(nil)
+            next
+          end
          end
          msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
          count -=1
+        retval.push(par)
        end
+      return retval
      end
  
-
    end
  end
+
+Irc::Utils.bot = Irc::Plugins.manager.bot