Utils: try non-paragraphs if no paragraphs were found

[user/henk/code/ruby/rbot.git] / lib / rbot / core / utils / utils.rb
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index c2b7d7b104fa9e8c1c0e9183390aa5b80e58edc4..bbdd462b4b560d38641a3b9566e448e4c08e7e17 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -1,3 +1,17 @@
+#-- vim:sw=2:et
+#++
+#
+# :title: rbot utilities provider
+#
+# Author:: Tom Gilbert <tom@linuxbrit.co.uk>
+# Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
+#
+# Copyright:: (C) 2002-2006 Tom Gilbert
+# Copyright:: (C) 2007 Giuseppe Bilotta
+#
+# TODO some of these Utils should be rewritten as extensions to the appropriate
+# standard Ruby classes and accordingly be moved to extends.rb
+
  require 'net/http'
  require 'uri'
  require 'tempfile'
@@ -314,6 +328,7 @@ module ::Irc
        secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
        mins, secs = secs.divmod SEC_PER_MIN
        secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
+      secs = secs.to_i
        secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
        case ret.length
        when 0
@@ -416,10 +431,11 @@ module ::Irc
        end
      end
  
-    H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
-    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
+    HX_REGEX = /<h(\d)(?:\s+[^>]*)?>.*?<\/h\1>/im
+    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
      # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
-    # If possible, grab the one after the first h1 heading
+    # If possible, grab the one after the first heading
      #
      # It is possible to pass some options to determine how the stripping
      # occurs. Currently, only one option is supported:
@@ -431,7 +447,7 @@ module ::Irc
        strip = opts[:strip]
        strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
  
-      header_found = xml.match(H1_REGEX)
+      header_found = xml.match(HX_REGEX)
        if header_found
          header_found = $'
          debug "Found header: #{header_found[1].inspect}"
@@ -456,6 +472,19 @@ module ::Irc
           txt.sub!(strip, '') if strip
          end
        end
+
+      # Nothing yet ... let's get drastic: we can also try non-paragraph tags whose attributes mention 'body'
+      if txt.empty?
+       header_found = xml
+        while txt.empty? 
+          candidate = header_found[AFTER_PAR1_REGEX]
+          break unless candidate
+          txt = candidate.ircify_html
+          header_found = $'
+         txt.sub!(strip, '') if strip
+        end
+      end
+
        return txt
      end