diff options
-rw-r--r-- | lib/rbot/core/utils/extends.rb | 12 | ||||
-rw-r--r-- | lib/rbot/core/utils/utils.rb | 14 |
2 files changed, 18 insertions, 8 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index 7022fb91..0ecf7aa2 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -41,18 +41,24 @@ class ::String
   def ircify_html
     txt = self
 
+    # remove scripts
+    txt.gsub!(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "")
+
+    # remove styles
+    txt.gsub!(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+
     # bold and strong -> bold
-    txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}")
+    txt.gsub!(/<\/?(?:b|strong)(?:\s+[^>]*)?>/im, "#{Bold}")
 
     # italic, emphasis and underline -> underline
-    txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}")
+    txt.gsub!(/<\/?(?:i|em|u)(?:\s+[^>]*)?>/im, "#{Underline}")
 
     ## This would be a nice addition, but the results are horrible
     ## Maybe make it configurable?
     # txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}")
 
     # Paragraph and br tags are converted to whitespace
-    txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ')
+    txt.gsub!(/<\/?(p|br)(?:\s+[^>]*)?\s*\/?\s*>/, ' ')
     txt.gsub!("\n", ' ')
     txt.gsub!("\r", ' ')
 
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index f2918067..e1d61039 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -440,12 +440,12 @@ module ::Irc
     HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
     PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 
-    # Some blogging and forum platforms use spans or divs with a 'body' in their class
+    # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
     # to mark actual text
-    AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 
     # At worst, we can try stuff which is comprised between two <br>
-    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
 
     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
     # If possible, grab the one after the first heading
@@ -456,8 +456,8 @@ module ::Irc
     # text
     # * :min_spaces => Minimum number of spaces a paragraph should have
     #
-    def Utils.ircify_first_html_par(xml, opts={})
-      txt = String.new
+    def Utils.ircify_first_html_par(xml_org, opts={})
+      xml = xml_org.gsub(/<!--.*?-->/, '')
 
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -465,6 +465,8 @@ module ::Irc
       min_spaces = opts[:min_spaces] || 8
       min_spaces = 0 if min_spaces < 0
 
+      txt = String.new
+
       while true
         debug "Minimum number of spaces: #{min_spaces}"
         header_found = xml.match(HX_REGEX)
@@ -511,6 +513,8 @@ module ::Irc
           debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
         end
 
+        return txt unless txt.empty? or txt.count(" ") < min_spaces
+
         # Attempt #2
         header_found = xml
         while txt.empty? or txt.count(" ") < min_spaces