diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-03-25 18:16:36 +0000 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-03-25 18:16:36 +0000 |
commit | c513b0227a88b441500581cff9e7f3f954830d2e (patch) | |
tree | 9db776532721aa7f9d5a1f1927a6a2ab764e744d | |
parent | bc9e991b8665fdd8f77a257c5381cf70d015a6ec (diff) |
Utils: when looking for the first par in a web page, look after any header, not just h1; also, be stricter on what's included in a paragraph
-rw-r--r-- | lib/rbot/core/utils/utils.rb | 8 |
1 file changed, 4 insertions, 4 deletions
```diff
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index bd35d8d0..cf16b601 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -431,10 +431,10 @@ module ::Irc
 end
 end
 
-H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
-PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
+HX_REGEX = /<h(\d)(?:\s+[^>]*)?>.*?<\/h\1>/im
+PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|(?:div|html|body|table|td|tr)(?:\s+[^>]*)?)>/im
 # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
-# If possible, grab the one after the first h1 heading
+# If possible, grab the one after the first heading
 #
 # It is possible to pass some options to determine how the stripping
 # occurs. Currently, only one option is supported:
@@ -446,7 +446,7 @@ module ::Irc
 strip = opts[:strip]
 strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
 
-header_found = xml.match(H1_REGEX)
+header_found = xml.match(HX_REGEX)
 if header_found
 header_found = $'
 debug "Found header: #{header_found[1].inspect}"
```