From cb9a6b2b4f3d5b79e12e97a4ef9e75190803606a Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Mon, 26 Mar 2007 12:44:14 +0000 Subject: Utils: more first par enhancements --- lib/rbot/core/utils/extends.rb | 12 +++++++++--- lib/rbot/core/utils/utils.rb | 14 +++++++++----- 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'lib/rbot') diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb index 7022fb91..0ecf7aa2 100644 --- a/lib/rbot/core/utils/extends.rb +++ b/lib/rbot/core/utils/extends.rb @@ -41,18 +41,24 @@ class ::String def ircify_html txt = self + # remove scripts + txt.gsub!(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "") + + # remove styles + txt.gsub!(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "") + # bold and strong -> bold - txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}") + txt.gsub!(/<\/?(?:b|strong)(?:\s+[^>]*)?>/im, "#{Bold}") # italic, emphasis and underline -> underline - txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}") + txt.gsub!(/<\/?(?:i|em|u)(?:\s+[^>]*)?>/im, "#{Underline}") ## This would be a nice addition, but the results are horrible ## Maybe make it configurable? 
# txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}") # Paragraph and br tags are converted to whitespace - txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ') + txt.gsub!(/<\/?(p|br)(?:\s+[^>]*)?\s*\/?\s*>/, ' ') txt.gsub!("\n", ' ') txt.gsub!("\r", ' ') diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index f2918067..e1d61039 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -440,12 +440,12 @@ module ::Irc HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im - # Some blogging and forum platforms use spans or divs with a 'body' in their class + # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class # to mark actual text - AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im + AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im # At worst, we can try stuff which is comprised between two <br>
- AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im + AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im # Try to grab and IRCify the first HTML par (<p> tag) in the given string. # If possible, grab the one after the first heading @@ -456,8 +456,8 @@ module ::Irc # text # * :min_spaces => Minimum number of spaces a paragraph should have # - def Utils.ircify_first_html_par(xml, opts={}) - txt = String.new + def Utils.ircify_first_html_par(xml_org, opts={}) + xml = xml_org.gsub(/<!--.*?-->/, '') strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) @@ -465,6 +465,8 @@ module ::Irc min_spaces = opts[:min_spaces] || 8 min_spaces = 0 if min_spaces < 0 + txt = String.new + while true debug "Minimum number of spaces: #{min_spaces}" header_found = xml.match(HX_REGEX) @@ -511,6 +513,8 @@ module ::Irc debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces" end + return txt unless txt.empty? or txt.count(" ") < min_spaces + # Attempt #2 header_found = xml while txt.empty? or txt.count(" ") < min_spaces -- cgit v1.2.3