diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-02-04 22:57:46 +0000 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-02-04 22:57:46 +0000 |
commit | 107d621a9f8b959e9f8047256d12a22360bcc6e0 (patch) | |
tree | 1f01c6df9d6f68f42725c149777c45da3c1a72fd /data | |
parent | b1e2896b32af5971b19feac720956ba9d1395938 (diff) |
Improve 'first paragraph' detection in search plugin, and clean up ircify_html method
Diffstat (limited to 'data')
-rw-r--r-- | data/rbot/plugins/search.rb | 49 |
1 files changed, 39 insertions, 10 deletions
diff --git a/data/rbot/plugins/search.rb b/data/rbot/plugins/search.rb index a035b831..f5bab421 100644 --- a/data/rbot/plugins/search.rb +++ b/data/rbot/plugins/search.rb @@ -15,15 +15,32 @@ class ::String def ircify_html txt = self - txt.gsub!(/<\/?b\s*>/, "#{Bold}") - txt.gsub!(/<\/?i\s*>/, "#{Underline}") + + # bold and strong -> bold + txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}") + + # italic, emphasis and underline -> underline + txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}") + ## This would be a nice addition, but the results are horrible ## Maybe make it configurable? # txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}") - txt.gsub!(/<\/?(p|br)>/, ' ') + + # Paragraph and br tags are converted to whitespace. + txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ') txt.gsub!("\n", ' ') + + # All other tags are just removed txt.gsub!(/<[^>]+>/, '') + + # Remove double formatting options, since they only waste bytes + txt.gsub!(/#{Bold}\s*#{Bold}/,"") + txt.gsub!(/#{Underline}\s*#{Underline}/,"") + + # And finally whitespace is squeezed txt.gsub!(/\s+/, ' ') + + # Decode entities and strip whitespace return Utils.decode_html_entities(txt).strip! end end @@ -104,22 +121,34 @@ class SearchPlugin < Plugin next end # We get the first par after the first main heading, if possible - header_found = xml.match(/<h1( [^>]*)?>.*?<\/h1>/im) - txt = nil + header_found = xml.match(/<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im) + txt = String.new if header_found - txt = header_found.post_match[/<p( [^>]*)?>.*?<\/p>/im] + debug "Found header: #{header_found[1].inspect}" + while txt.empty? + header_found = $' + candidate = header_found[/<p(?:\s+[^>]*)?>.*?<\/p>/im].ircify_html + break unless candidate + txt.replace candidate + end end # If we haven't found a first par yet, try to get it from the whole # document - unless txt - txt = xml[/<p( [^>]*)?>.*?<\/p>/im] + if txt.empty? + txt = xml[/<p(?:\s+[^>]*)?>.*?<\/p>/im].ircify_html + while txt.empty? + header_found = $' + candidate = header_found[/<p(?:\s+[^>]*)?>.*?<\/p>/im].ircify_html + break unless candidate + txt.replace candidate + end end # Nothing yet, give up - unless txt + if txt.empty? debug "No first par found\n#{xml}" next end - m.reply "[#{idx}] #{txt.ircify_html}".omissis_after(400) + m.reply "[#{idx}] #{txt}".omissis_after(400) first_pars -=1 end end |