Utils: when looking for the first paragraph in a web page, look after any heading, not just h1; also, be stricter about what is included in a paragraph

author: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-03-25 18:16:36 +0000
committer: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-03-25 18:16:36 +0000
commit: c513b0227a88b441500581cff9e7f3f954830d2e (patch)
tree: 9db776532721aa7f9d5a1f1927a6a2ab764e744d
parent: bc9e991b8665fdd8f77a257c5381cf70d015a6ec (diff)
1 files changed, 4 insertions, 4 deletions
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index bd35d8d0..cf16b601 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -431,10 +431,10 @@ module ::Irc
       end
     end
 
-    H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
-    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
+    HX_REGEX = /<h(\d)(?:\s+[^>]*)?>.*?<\/h\1>/im
+    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|(?:div|html|body|table|td|tr)(?:\s+[^>]*)?)>/im
     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
-    # If possible, grab the one after the first h1 heading
+    # If possible, grab the one after the first heading
     #
     # It is possible to pass some options to determine how the stripping
     # occurs. Currently, only one option is supported:
@@ -446,7 +446,7 @@ module ::Irc
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
 
-      header_found = xml.match(H1_REGEX)
+      header_found = xml.match(HX_REGEX)
       if header_found
         header_found = $'
         debug "Found header: #{header_found[1].inspect}"
author	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-03-25 18:16:36 +0000
committer	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-03-25 18:16:36 +0000
commit	c513b0227a88b441500581cff9e7f3f954830d2e (patch)
tree	9db776532721aa7f9d5a1f1927a6a2ab764e744d
parent	bc9e991b8665fdd8f77a257c5381cf70d015a6ec (diff)