From e36c0f2c5de9f951cdf37e8d7233a2c6ead197a6 Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Sun, 25 Mar 2007 21:23:11 +0000 Subject: Utils: retry after requiring rubygems if htmlentities failed to load; when grabbing first pars, try filtering out too short paragraphs --- lib/rbot/core/utils/utils.rb | 98 ++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 41 deletions(-) (limited to 'lib/rbot') diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 08396107..047b29d6 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -17,12 +17,16 @@ require 'uri' require 'tempfile' begin - $we_have_html_entities_decoder = require 'htmlentities' + require 'htmlentities' + $we_have_html_entities_decoder = true rescue LoadError - $we_have_html_entities_decoder = false - module ::Irc - module Utils - UNESCAPE_TABLE = { + if require 'rubygems' rescue false + retry + else + $we_have_html_entities_decoder = false + module ::Irc + module Utils + UNESCAPE_TABLE = { 'laquo' => '<<', 'raquo' => '>>', 'quot' => '"', @@ -289,7 +293,8 @@ rescue LoadError 'sigma' => 'σ', 'oacute' => '\xf3', =end - } + } + end end end end @@ -431,7 +436,7 @@ module ::Irc end end - HX_REGEX = /]*)?>.*?<\/h\1>/im + HX_REGEX = /]*)?>(.*?)<\/h\1>/im PAR_REGEX = /]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im # Some blogging and forum platforms use spans or divs with a 'body' in their class @@ -442,9 +447,10 @@ module ::Irc # If possible, grab the one after the first heading # # It is possible to pass some options to determine how the stripping - # occurs. Currently, only one option is supported: + # occurs. Currently supported options are # * :strip => Regex or String to strip at the beginning of the obtained # text + # * :min_spaces => Minimum number of spaces a paragraph should have # def Utils.ircify_first_html_par(xml, opts={}) txt = String.new @@ -452,47 +458,57 @@ module ::Irc strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) - header_found = xml.match(HX_REGEX) - if header_found - header_found = $' - debug "Found header: #{header_found[1].inspect}" - while txt.empty? + min_spaces = opts[:min_spaces] || 8 + min_spaces = 0 if min_spaces < 0 + + while true + debug "Minimum number of spaces: #{min_spaces}" + header_found = xml.match(HX_REGEX) + if header_found + header_found = $' + while txt.empty? or txt.count(" ") < min_spaces + candidate = header_found[PAR_REGEX] + break unless candidate + txt = candidate.ircify_html + header_found = $' + txt.sub!(strip, '') if strip + debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces" + end + end + + return txt unless txt.empty? or txt.count(" ") < min_spaces + + # If we haven't found a first par yet, try to get it from the whole + # document + header_found = xml + while txt.empty? or txt.count(" ") < min_spaces candidate = header_found[PAR_REGEX] break unless candidate txt = candidate.ircify_html header_found = $' - txt.sub!(strip, '') if strip + txt.sub!(strip, '') if strip + debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces" end - end - return txt unless txt.empty? - - # If we haven't found a first par yet, try to get it from the whole - # document - header_found = xml - while txt.empty? - candidate = header_found[PAR_REGEX] - break unless candidate - txt = candidate.ircify_html - header_found = $' - txt.sub!(strip, '') if strip - end + return txt unless txt.empty? or txt.count(" ") < min_spaces - return txt unless txt.empty? - - # Nothing yet ... let's get drastic: we look for non-par elements too, - # but only for those that match something that we know is likely to - # contain text - header_found = xml - while txt.empty? - candidate = header_found[AFTER_PAR1_REGEX] - break unless candidate - txt = candidate.ircify_html - header_found = $' - txt.sub!(strip, '') if strip - end + # Nothing yet ... let's get drastic: we look for non-par elements too, + # but only for those that match something that we know is likely to + # contain text + header_found = xml + while txt.empty? or txt.count(" ") < min_spaces + candidate = header_found[AFTER_PAR1_REGEX] + break unless candidate + txt = candidate.ircify_html + header_found = $' + txt.sub!(strip, '') if strip + debug "(other attempt) #{txt.inspect} has #{txt.count(" ")} spaces" + end - return txt + debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces" + return txt unless txt.count(" ") < min_spaces + min_spaces /= 2 + end end # Get the first pars of the first _count_ _urls_. -- cgit v1.2.3