summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r-- lib/rbot/core/utils/extends.rb 12
-rw-r--r-- lib/rbot/core/utils/utils.rb 14
2 files changed, 18 insertions, 8 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index 7022fb91..0ecf7aa2 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -41,18 +41,24 @@ class ::String
def ircify_html
txt = self
+ # remove scripts
+ txt.gsub!(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "")
+
+ # remove styles
+ txt.gsub!(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+
# bold and strong -> bold
- txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}")
+ txt.gsub!(/<\/?(?:b|strong)(?:\s+[^>]*)?>/im, "#{Bold}")
# italic, emphasis and underline -> underline
- txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}")
+ txt.gsub!(/<\/?(?:i|em|u)(?:\s+[^>]*)?>/im, "#{Underline}")
## This would be a nice addition, but the results are horrible
## Maybe make it configurable?
# txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}")
# Paragraph and br tags are converted to whitespace
- txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ')
+ txt.gsub!(/<\/?(p|br)(?:\s+[^>]*)?\s*\/?\s*>/, ' ')
txt.gsub!("\n", ' ')
txt.gsub!("\r", ' ')
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index f2918067..e1d61039 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -440,12 +440,12 @@ module ::Irc
HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
- # Some blogging and forum platforms use spans or divs with a 'body' in their class
+ # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
# to mark actual text
- AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+ AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
# At worst, we can try stuff which is comprised between two <br>
- AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+ AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
# Try to grab and IRCify the first HTML par (<p> tag) in the given string.
# If possible, grab the one after the first heading
@@ -456,8 +456,8 @@ module ::Irc
# text
# * :min_spaces => Minimum number of spaces a paragraph should have
#
- def Utils.ircify_first_html_par(xml, opts={})
- txt = String.new
+ def Utils.ircify_first_html_par(xml_org, opts={})
+ xml = xml_org.gsub(/<!--.*?-->/, '')
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -465,6 +465,8 @@ module ::Irc
min_spaces = opts[:min_spaces] || 8
min_spaces = 0 if min_spaces < 0
+ txt = String.new
+
while true
debug "Minimum number of spaces: #{min_spaces}"
header_found = xml.match(HX_REGEX)
@@ -511,6 +513,8 @@ module ::Irc
debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
end
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
# Attempt #2
header_found = xml
while txt.empty? or txt.count(" ") < min_spaces