Utils: more first par enhancements

author Giuseppe Bilotta <giuseppe.bilotta@gmail.com>

Mon, 26 Mar 2007 12:44:14 +0000 (12:44 +0000)

committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com>

Mon, 26 Mar 2007 12:44:14 +0000 (12:44 +0000)
author Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
Mon, 26 Mar 2007 12:44:14 +0000 (12:44 +0000)
committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
Mon, 26 Mar 2007 12:44:14 +0000 (12:44 +0000)
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb

index 7022fb9123ac17b2d3df543e0d0f66002bbea342..0ecf7aa2bde2a5d352aa340701b03ab30c7fac88 100644 (file)
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -41,18 +41,24 @@ class ::String
    def ircify_html
      txt = self
  
+    # remove scripts
+    txt.gsub!(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "")
+
+    # remove styles
+    txt.gsub!(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+
      # bold and strong -> bold
-    txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}")
+    txt.gsub!(/<\/?(?:b|strong)(?:\s+[^>]*)?>/im, "#{Bold}")
  
      # italic, emphasis and underline -> underline
-    txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}")
+    txt.gsub!(/<\/?(?:i|em|u)(?:\s+[^>]*)?>/im, "#{Underline}")
  
      ## This would be a nice addition, but the results are horrible
      ## Maybe make it configurable?
      # txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}")
  
      # Paragraph and br tags are converted to whitespace
-    txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ')
+    txt.gsub!(/<\/?(p|br)(?:\s+[^>]*)?\s*\/?\s*>/, ' ')
      txt.gsub!("\n", ' ')
      txt.gsub!("\r", ' ')
  
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index f291806700fe07443924f209c5f2343b31938aa3..e1d61039e44df665669821fb25a737b001956e7d 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -440,12 +440,12 @@ module ::Irc
      HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
      PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
  
-    # Some blogging and forum platforms use spans or divs with a 'body' in their class
+    # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
      # to mark actual text
-    AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
  
      # At worst, we can try stuff which is comprised between two <br>
-    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
  
      # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
      # If possible, grab the one after the first heading
@@ -456,8 +456,8 @@ module ::Irc
      #               text
      #   * :min_spaces => Minimum number of spaces a paragraph should have
      #
-    def Utils.ircify_first_html_par(xml, opts={})
-      txt = String.new
+    def Utils.ircify_first_html_par(xml_org, opts={})
+      xml = xml_org.gsub(/<!--.*?-->/, '')
  
        strip = opts[:strip]
        strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -465,6 +465,8 @@ module ::Irc
        min_spaces = opts[:min_spaces] || 8
        min_spaces = 0 if min_spaces < 0
  
+      txt = String.new
+
        while true
          debug "Minimum number of spaces: #{min_spaces}"
          header_found = xml.match(HX_REGEX)
@@ -511,6 +513,8 @@ module ::Irc
            debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
          end
  
+        return txt unless txt.empty? or txt.count(" ") < min_spaces
+
          # Attempt #2
          header_found = xml
          while txt.empty? or txt.count(" ") < min_spaces
author	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
	Mon, 26 Mar 2007 12:44:14 +0000 (12:44 +0000)
committer	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
	Mon, 26 Mar 2007 12:44:14 +0000 (12:44 +0000)
lib/rbot/core/utils/extends.rb		patch | blob | history
lib/rbot/core/utils/utils.rb		patch | blob | history