From c513b0227a88b441500581cff9e7f3f954830d2e Mon Sep 17 00:00:00 2001
From: Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
Date: Sun, 25 Mar 2007 18:16:36 +0000
Subject: Utils: when looking for the first paragraph in a web page, look after
 any heading, not just h1; also, be stricter on what's included in a paragraph

---
 lib/rbot/core/utils/utils.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib/rbot')
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index bd35d8d0..cf16b601 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -431,10 +431,10 @@ module ::Irc
       end
     end
 
-    H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
-    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
+    HX_REGEX = /<h(\d)(?:\s+[^>]*)?>.*?<\/h\1>/im
+    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|(?:div|html|body|table|td|tr)(?:\s+[^>]*)?)>/im
     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
-    # If possible, grab the one after the first h1 heading
+    # If possible, grab the one after the first heading
     #
     # It is possible to pass some options to determine how the stripping
     # occurs. Currently, only one option is supported:
@@ -446,7 +446,7 @@ module ::Irc
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
 
-      header_found = xml.match(H1_REGEX)
+      header_found = xml.match(HX_REGEX)
       if header_found
         header_found = $'
         debug "Found header: #{header_found[1].inspect}"
-- 
cgit v1.2.3