From c513b0227a88b441500581cff9e7f3f954830d2e Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Sun, 25 Mar 2007 18:16:36 +0000 Subject: Utils: when looking for the first par in a web page, look after any header, not just h1; also, be stricter on what's included in a paragraph --- lib/rbot/core/utils/utils.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/rbot') diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index bd35d8d0..cf16b601 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -431,10 +431,10 @@ module ::Irc end end - H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im - PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im + HX_REGEX = /<h(\d)(?:\s+[^>]*)?>.*?<\/h\1>/im + PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|(?:div|html|body|table|td|tr)(?:\s+[^>]*)?)>/im # Try to grab and IRCify the first HTML par (<p> tag) in the given string. - # If possible, grab the one after the first h1 heading + # If possible, grab the one after the first heading # # It is possible to pass some options to determine how the stripping # occurs. Currently, only one option is supported: @@ -446,7 +446,7 @@ module ::Irc strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) - header_found = xml.match(H1_REGEX) + header_found = xml.match(HX_REGEX) if header_found header_found = $' debug "Found header: #{header_found[1].inspect}" -- cgit v1.2.3