X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Futils.rb;h=535ae190179c1bfbfa577a180d42baa24014f079;hb=34fa3a625fb63e927de8fd11769ced3c4d9cc83b;hp=9b978ad8079226d02f0d6e60346ccaaa890ec789;hpb=de209119bd6b098381db11d9b0de07c7d898c12e;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git

diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 9b978ad8..535ae190 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -31,272 +31,83 @@ rescue LoadError
     module ::Irc
       module Utils
         UNESCAPE_TABLE = {
-    'laquo' => '<<',
-    'raquo' => '>>',
+    'laquo' => '«',
+    'raquo' => '»',
     'quot' => '"',
     'apos' => '\'',
-    'micro' => 'u',
-    'copy' => '(c)',
-    'trade' => '(tm)',
-    'reg' => '(R)',
-    '#174' => '(R)',
-    '#8220' => '"',
-    '#8221' => '"',
-    '#8212' => '--',
-    '#39' => '\'',
+    'micro' => 'µ',
+    'copy' => '©',
+    'trade' => '™',
+    'reg' => '®',
     'amp' => '&',
     'lt' => '<',
     'gt' => '>',
-    'hellip' => '...',
-    'nbsp' => ' ',
-=begin
-    # extras codes, for future use...
-    'zwnj' => '&#8204;',
-    'aring' => '\xe5',
-    'gt' => '>',
-    'yen' => '\xa5',
-    'ograve' => '\xf2',
-    'Chi' => '&#935;',
-    'bull' => '&#8226;',
-    'Egrave' => '\xc8',
-    'Ntilde' => '\xd1',
-    'upsih' => '&#978;',
-    'Yacute' => '\xdd',
-    'asymp' => '&#8776;',
-    'radic' => '&#8730;',
-    'otimes' => '&#8855;',
-    'nabla' => '&#8711;',
-    'aelig' => '\xe6',
-    'oelig' => '&#339;',
-    'equiv' => '&#8801;',
-    'Psi' => '&#936;',
-    'auml' => '\xe4',
-    'circ' => '&#710;',
-    'Acirc' => '\xc2',
-    'Epsilon' => '&#917;',
-    'Yuml' => '&#376;',
-    'Eta' => '&#919;',
-    'Icirc' => '\xce',
-    'Upsilon' => '&#933;',
-    'ndash' => '&#8211;',
-    'there4' => '&#8756;',
-    'Prime' => '&#8243;',
-    'prime' => '&#8242;',
-    'psi' => '&#968;',
-    'Kappa' => '&#922;',
-    'rsaquo' => '&#8250;',
-    'Tau' => '&#932;',
-    'darr' => '&#8595;',
-    'ocirc' => '\xf4',
-    'lrm' => '&#8206;',
-    'zwj' => '&#8205;',
-    'cedil' => '\xb8',
-    'Ecirc' => '\xca',
-    'not' => '\xac',
-    'AElig' => '\xc6',
-    'oslash' => '\xf8',
-    'acute' => '\xb4',
-    'lceil' => '&#8968;',
-    'shy' => '\xad',
-    'rdquo' => '&#8221;',
-    'ge' => '&#8805;',
-    'Igrave' => '\xcc',
-    'Ograve' => '\xd2',
-    'euro' => '&#8364;',
-    'dArr' => '&#8659;',
-    'sdot' => '&#8901;',
-    'nbsp' => '\xa0',
-    'lfloor' => '&#8970;',
-    'lArr' => '&#8656;',
-    'Auml' => '\xc4',
-    'larr' => '&#8592;',
-    'Atilde' => '\xc3',
-    'Otilde' => '\xd5',
-    'szlig' => '\xdf',
-    'clubs' => '&#9827;',
-    'diams' => '&#9830;',
-    'agrave' => '\xe0',
-    'Ocirc' => '\xd4',
-    'Iota' => '&#921;',
-    'Theta' => '&#920;',
-    'Pi' => '&#928;',
-    'OElig' => '&#338;',
-    'Scaron' => '&#352;',
-    'frac14' => '\xbc',
-    'egrave' => '\xe8',
-    'sub' => '&#8834;',
-    'iexcl' => '\xa1',
-    'frac12' => '\xbd',
-    'sbquo' => '&#8218;',
-    'ordf' => '\xaa',
-    'sum' => '&#8721;',
-    'prop' => '&#8733;',
-    'Uuml' => '\xdc',
-    'ntilde' => '\xf1',
-    'sup' => '&#8835;',
-    'theta' => '&#952;',
-    'prod' => '&#8719;',
-    'nsub' => '&#8836;',
-    'hArr' => '&#8660;',
-    'rlm' => '&#8207;',
-    'THORN' => '\xde',
-    'infin' => '&#8734;',
-    'yuml' => '\xff',
-    'Mu' => '&#924;',
-    'le' => '&#8804;',
-    'Eacute' => '\xc9',
-    'thinsp' => '&#8201;',
-    'ecirc' => '\xea',
-    'bdquo' => '&#8222;',
-    'Sigma' => '&#931;',
-    'fnof' => '&#402;',
-    'Aring' => '\xc5',
-    'tilde' => '&#732;',
-    'frac34' => '\xbe',
-    'emsp' => '&#8195;',
-    'mdash' => '&#8212;',
-    'uarr' => '&#8593;',
-    'permil' => '&#8240;',
-    'Ugrave' => '\xd9',
-    'rarr' => '&#8594;',
-    'Agrave' => '\xc0',
-    'chi' => '&#967;',
-    'forall' => '&#8704;',
-    'eth' => '\xf0',
-    'rceil' => '&#8969;',
-    'iuml' => '\xef',
-    'gamma' => '&#947;',
-    'lambda' => '&#955;',
-    'harr' => '&#8596;',
-    'rang' => '&#9002;',
-    'xi' => '&#958;',
-    'dagger' => '&#8224;',
-    'divide' => '\xf7',
-    'Ouml' => '\xd6',
-    'image' => '&#8465;',
-    'alefsym' => '&#8501;',
-    'igrave' => '\xec',
-    'otilde' => '\xf5',
-    'Oacute' => '\xd3',
-    'sube' => '&#8838;',
-    'alpha' => '&#945;',
-    'frasl' => '&#8260;',
-    'ETH' => '\xd0',
-    'lowast' => '&#8727;',
-    'Nu' => '&#925;',
-    'plusmn' => '\xb1',
-    'Euml' => '\xcb',
-    'real' => '&#8476;',
-    'sup1' => '\xb9',
-    'sup2' => '\xb2',
-    'sup3' => '\xb3',
-    'Oslash' => '\xd8',
-    'Aacute' => '\xc1',
-    'cent' => '\xa2',
-    'oline' => '&#8254;',
-    'Beta' => '&#914;',
-    'perp' => '&#8869;',
-    'Delta' => '&#916;',
-    'loz' => '&#9674;',
-    'pi' => '&#960;',
-    'iota' => '&#953;',
-    'empty' => '&#8709;',
-    'euml' => '\xeb',
-    'brvbar' => '\xa6',
-    'iacute' => '\xed',
-    'para' => '\xb6',
-    'micro' => '\xb5',
-    'cup' => '&#8746;',
-    'weierp' => '&#8472;',
-    'uuml' => '\xfc',
-    'part' => '&#8706;',
-    'icirc' => '\xee',
-    'delta' => '&#948;',
-    'omicron' => '&#959;',
-    'upsilon' => '&#965;',
-    'Iuml' => '\xcf',
-    'Lambda' => '&#923;',
-    'Xi' => '&#926;',
-    'kappa' => '&#954;',
-    'ccedil' => '\xe7',
-    'Ucirc' => '\xdb',
-    'cap' => '&#8745;',
-    'mu' => '&#956;',
-    'scaron' => '&#353;',
-    'lsquo' => '&#8216;',
-    'isin' => '&#8712;',
-    'Zeta' => '&#918;',
-    'supe' => '&#8839;',
-    'deg' => '\xb0',
-    'and' => '&#8743;',
-    'tau' => '&#964;',
-    'pound' => '\xa3',
-    'hellip' => '&#8230;',
-    'curren' => '\xa4',
-    'int' => '&#8747;',
-    'ucirc' => '\xfb',
-    'rfloor' => '&#8971;',
-    'ensp' => '&#8194;',
-    'crarr' => '&#8629;',
-    'ugrave' => '\xf9',
-    'notin' => '&#8713;',
-    'exist' => '&#8707;',
-    'uArr' => '&#8657;',
-    'cong' => '&#8773;',
-    'Dagger' => '&#8225;',
-    'oplus' => '&#8853;',
-    'times' => '\xd7',
-    'atilde' => '\xe3',
-    'piv' => '&#982;',
-    'ni' => '&#8715;',
-    'Phi' => '&#934;',
-    'lsaquo' => '&#8249;',
-    'Uacute' => '\xda',
-    'Omicron' => '&#927;',
-    'ang' => '&#8736;',
-    'ne' => '&#8800;',
-    'iquest' => '\xbf',
-    'eta' => '&#951;',
-    'yacute' => '\xfd',
-    'Rho' => '&#929;',
-    'uacute' => '\xfa',
-    'Alpha' => '&#913;',
-    'zeta' => '&#950;',
-    'Omega' => '&#937;',
-    'nu' => '&#957;',
-    'sim' => '&#8764;',
-    'sect' => '\xa7',
-    'phi' => '&#966;',
-    'sigmaf' => '&#962;',
-    'macr' => '\xaf',
-    'minus' => '&#8722;',
-    'Ccedil' => '\xc7',
-    'ordm' => '\xba',
-    'epsilon' => '&#949;',
-    'beta' => '&#946;',
-    'rArr' => '&#8658;',
-    'rho' => '&#961;',
-    'aacute' => '\xe1',
-    'eacute' => '\xe9',
-    'omega' => '&#969;',
-    'middot' => '\xb7',
-    'Gamma' => '&#915;',
-    'Iacute' => '\xcd',
-    'lang' => '&#9001;',
-    'spades' => '&#9824;',
-    'rsquo' => '&#8217;',
-    'uml' => '\xa8',
-    'thorn' => '\xfe',
-    'ouml' => '\xf6',
-    'thetasym' => '&#977;',
-    'or' => '&#8744;',
-    'raquo' => '\xbb',
-    'acirc' => '\xe2',
-    'ldquo' => '&#8220;',
-    'hearts' => '&#9829;',
-    'sigma' => '&#963;',
-    'oacute' => '\xf3',
-=end
+    'hellip' => '…',
+    'nbsp' => ' ',
+    'Agrave' => 'À',
+    'Aacute' => 'Á',
+    'Acirc' => 'Â',
+    'Atilde' => 'Ã',
+    'Auml' => 'Ä',
+    'Aring' => 'Å',
+    'AElig' => 'Æ',
+    'OElig' => 'Œ',
+    'Ccedil' => 'Ç',
+    'Egrave' => 'È',
+    'Eacute' => 'É',
+    'Ecirc' => 'Ê',
+    'Euml' => 'Ë',
+    'Igrave' => 'Ì',
+    'Iacute' => 'Í',
+    'Icirc' => 'Î',
+    'Iuml' => 'Ï',
+    'ETH' => 'Ð',
+    'Ntilde' => 'Ñ',
+    'Ograve' => 'Ò',
+    'Oacute' => 'Ó',
+    'Ocirc' => 'Ô',
+    'Otilde' => 'Õ',
+    'Ouml' => 'Ö',
+    'Oslash' => 'Ø',
+    'Ugrave' => 'Ù',
+    'Uacute' => 'Ú',
+    'Ucirc' => 'Û',
+    'Uuml' => 'Ü',
+    'Yacute' => 'Ý',
+    'THORN' => 'Þ',
+    'szlig' => 'ß',
+    'agrave' => 'à',
+    'aacute' => 'á',
+    'acirc' => 'â',
+    'atilde' => 'ã',
+    'auml' => 'ä',
+    'aring' => 'å',
+    'aelig' => 'æ',
+    'oelig' => 'œ',
+    'ccedil' => 'ç',
+    'egrave' => 'è',
+    'eacute' => 'é',
+    'ecirc' => 'ê',
+    'euml' => 'ë',
+    'igrave' => 'ì',
+    'iacute' => 'í',
+    'icirc' => 'î',
+    'iuml' => 'ï',
+    'eth' => 'ð',
+    'ntilde' => 'ñ',
+    'ograve' => 'ò',
+    'oacute' => 'ó',
+    'ocirc' => 'ô',
+    'otilde' => 'õ',
+    'ouml' => 'ö',
+    'oslash' => 'ø',
+    'ugrave' => 'ù',
+    'uacute' => 'ú',
+    'ucirc' => 'û',
+    'uuml' => 'ü',
+    'yacute' => 'ý',
+    'thorn' => 'þ',
+    'yuml' => 'ÿ'
         }
       end
     end
@@ -304,7 +115,14 @@ rescue LoadError
 end
 
 begin
-  require 'htmlentities'
+  require 'hpricot'
+  module ::Irc
+    module Utils
+      AFTER_PAR_PATH = /^(?:div|span)$/ # element names checked first by the hpricot par grabber
+      AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/ # table elements, used only when no div/span qualifies
+      AFTER_PAR_CLASS = /body|message|text/i # matched against an element's class and id attributes
+    end
+  end
 rescue LoadError
   gems = nil
   begin
@@ -317,7 +135,10 @@ rescue LoadError
   else
     module ::Irc
       module Utils
-        # define some regular expressions to be used for first_html_par
+        # Some regular expressions to manage HTML data
+
+        # Title
+        TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
 
         # H1, H2, etc
         HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
@@ -337,15 +158,17 @@ end
 
 module ::Irc
 
-  # miscellaneous useful functions
+  # Miscellaneous useful functions
   module Utils
     @@bot = nil unless defined? @@bot
     @@safe_save_dir = nil unless defined?(@@safe_save_dir)
 
+    # The bot instance
     def Utils.bot
       @@bot
     end
 
+    # Set up some Utils routines which depend on the associated bot.
     def Utils.bot=(b)
       debug "initializing utils"
       @@bot = b
@@ -353,12 +176,18 @@ module ::Irc
     end
 
 
+    # Seconds per minute
     SEC_PER_MIN = 60
+    # Seconds per hour
     SEC_PER_HR = SEC_PER_MIN * 60
+    # Seconds per day
     SEC_PER_DAY = SEC_PER_HR * 24
+    # Seconds per (30-day) month
     SEC_PER_MNTH = SEC_PER_DAY * 30
+    # Seconds per (30*12 = 360 day) year
     SEC_PER_YR = SEC_PER_MNTH * 12
 
+    # Auxiliary method needed by Utils.secs_to_string
     def Utils.secs_to_string_case(array, var, string, plural)
       case var
       when 1
@@ -368,8 +197,8 @@ module ::Irc
       end
     end
 
-    # turn a number of seconds into a human readable string, e.g
-    # 2 days, 3 hours, 18 minutes, 10 seconds
+    # Turn a number of seconds into a human readable string, e.g.
+    # 2 days, 3 hours, 18 minutes and 10 seconds
     def Utils.secs_to_string(secs)
       ret = []
       years, secs = secs.divmod SEC_PER_YR
@@ -395,9 +224,12 @@ module ::Irc
     end
 
 
+    # Execute an external program, returning a String obtained by redirecting
+    # the program's standard error and standard output
+    #
     def Utils.safe_exec(command, *args)
-      IO.popen("-") {|p|
-        if(p)
+      IO.popen("-") { |p|
+        if p
           return p.readlines.join("\n")
         else
           begin
@@ -414,6 +246,11 @@ module ::Irc
     end
 
 
+    # Safely (atomically) save to _file_, by passing a tempfile to the block
+    # and then moving the tempfile to its final location when done.
+    #
+    # call-seq: Utils.safe_save(file, &block)
+    #
     def Utils.safe_save(file)
       raise 'No safe save directory defined!' if @@safe_save_dir.nil?
       basename = File.basename(file)
@@ -425,6 +262,9 @@ module ::Irc
     end
 
 
+    # Decode HTML entities in the String _str_, using HTMLEntities if the
+    # package was found, or UNESCAPE_TABLE otherwise.
+    #
     def Utils.decode_html_entities(str)
       if defined? ::HTMLEntities
         return HTMLEntities.decode_entities(str)
@@ -437,7 +277,7 @@ module ::Irc
           end
 
           # output the symbol's irc-translated character, or a * if it's unknown
-          UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
+          UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [symbol.to_i].pack("U") : '*') # code point -> UTF-8
         }
       end
     end
@@ -447,9 +287,9 @@ module ::Irc
     #
     # It is possible to pass some options to determine how the stripping
     # occurs. Currently supported options are
-    #   * :strip => Regex or String to strip at the beginning of the obtained
-    #               text
-    #   * :min_spaces => Minimum number of spaces a paragraph should have
+    # strip:: Regex or String to strip at the beginning of the obtained
+    #         text
+    # min_spaces:: minimum number of spaces a paragraph should have
     #
     def Utils.ircify_first_html_par(xml_org, opts={})
       if defined? ::Hpricot
@@ -459,14 +299,14 @@ module ::Irc
       end
     end
 
-    # with hpricot
+    # HTML first par grabber using hpricot
     def Utils.ircify_first_html_par_wh(xml_org, opts={})
       doc = Hpricot(xml_org)
 
       # Strip styles and scripts
       (doc/"style|script").remove
 
-      debug doc.inspect
+      debug doc
 
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -476,23 +316,27 @@ module ::Irc
 
       txt = String.new
 
-      h = %w{h1 h2 h3 h4 h5 h6}
-      p = %w{p}
-      ar = []
-      h.each { |hx|
-        p.each { |px|
-          ar << "#{hx}~#{px}"
-        }
-      }
-      h_p_css = ar.join("|")
-      debug "css search: #{h_p_css}"
+      pre_h = pars = by_span = nil
 
       while true
         debug "Minimum number of spaces: #{min_spaces}"
 
         # Initial attempt: <p> that follows <h\d>
-        pre_h = doc/h_p_css
-        debug "Hx: found: #{pre_h.pretty_inspect}"
+        if pre_h.nil?
+          pre_h = Hpricot::Elements[]
+          found_h = false
+          doc.search("*") { |e|
+            next if e.bogusetag?
+            case e.pathname
+            when /^h\d/
+              found_h = true
+            when 'p'
+              pre_h << e if found_h
+            end
+          }
+          debug "Hx: found: #{pre_h.pretty_inspect}"
+        end
+
         pre_h.each { |p|
           debug p
           txt = p.to_html.ircify_html
@@ -504,7 +348,7 @@ module ::Irc
         return txt unless txt.empty? or txt.count(" ") < min_spaces
 
         # Second natural attempt: just get any <p>
-        pars = doc/"p"
+        pars = doc/"p" if pars.nil?
         debug "par: found: #{pars.pretty_inspect}"
         pars.each { |p|
           debug p
@@ -524,15 +368,27 @@ module ::Irc
         # 'message' or 'text' in their class to mark actual text. Since we want
         # the class match to be partial and case insensitive, we collect
         # the common elements that may have this class and then filter out those
-        # we don't need
-        pars = Hpricot::Elements[]
-        pre_pars = doc/"div|span|td|tr|tbody|table"
-        pre_pars.each { |el|
-          pars.push el if el.class =~ /body|message|text/i
-        }
-        debug "other \#1: found: #{pars.pretty_inspect}"
+        # we don't need. If no divs or spans are found, we'll accept additional
+        # elements too (td, tr, tbody, table).
+        if by_span.nil?
+          by_span = Hpricot::Elements[]
+          extra = Hpricot::Elements[]
+          doc.search("*") { |el|
+            next if el.bogusetag?
+            case el.pathname
+            when AFTER_PAR_PATH
+              by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+            when AFTER_PAR_EX
+              extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+            end
+          }
+          if by_span.empty? and not extra.empty?
+            by_span.concat extra
+          end
+          debug "other \#1: found: #{by_span.pretty_inspect}"
+        end
 
-        pars.each { |p|
+        by_span.each { |p|
           debug p
           txt = p.to_html.ircify_html
           txt.sub!(strip, '') if strip
@@ -552,7 +408,7 @@ module ::Irc
       end
     end
 
-    # without hpricot
+    # HTML first par grabber without hpricot
     def Utils.ircify_first_html_par_woh(xml_org, opts={})
       xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
 
@@ -630,6 +486,94 @@ module ::Irc
       end
     end
 
+    # Extract title, content (first par) and extra information from
+    # the document _doc_, dispatching on its class:
+    #
+    # String:: handed to Utils.get_string_html_info (title and content only)
+    # Net::HTTPResponse:: handed to Utils.get_resp_html_info
+    # URI:: fetched with the bot httputil service; the response is then
+    #       handed to Utils.get_resp_html_info
+    #
+    # _opts_ is passed as-is to the helper that is called. Returns the
+    # Hash built by that helper (an empty Hash if no response is yielded
+    # for a URI). Raises RuntimeError for any other _doc_ class, and lets
+    # through whatever the helpers raise (e.g. UrlLinkError).
+    def Utils.get_html_info(doc, opts={})
+      case doc
+      when String
+        Utils.get_string_html_info(doc, opts)
+      when Net::HTTPResponse
+        Utils.get_resp_html_info(doc, opts)
+      when URI
+        ret = Hash.new
+        @@bot.httputil.get_response(doc) { |resp| ret = Utils.get_resp_html_info(resp, opts) }
+        ret
+      else
+        # Explicit error: a bare raise here would re-raise whatever
+        # unrelated exception happens to be in $!, if any
+        raise "unsupported document type #{doc.class} for get_html_info"
+      end
+    end
+
+    class ::UrlLinkError < RuntimeError # raised by Utils.get_resp_html_info for non-success responses
+    end
+
+    # Extract title, content (first par) and headers from the
+    # Net::HTTPResponse _resp_. The _opts_ Hash passed in is not modified.
+    #
+    # Currently, the only accepted option (in _opts_) is
+    # uri_fragment:: the URI fragment of the original request; when unset,
+    #                the fragment of the x-rbot-location/location header is used
+    #
+    # Returns a Hash with the following keys:
+    # title, content:: as returned by Utils.get_string_html_info; only
+    #                  present when the content type is text, XML or HTML
+    # headers:: the result of Net::HTTPResponse#to_hash (lowercase header
+    #           names as keys, Arrays as values)
+    #
+    # Raises UrlLinkError unless _resp_ is a Net::HTTPSuccess.
+    def Utils.get_resp_html_info(resp, opts={})
+      ret = Hash.new
+      case resp
+      when Net::HTTPSuccess
+        ret[:headers] = resp.to_hash
+        partial = resp.partial_body(@@bot.config['http.info_bytes'])
+        if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
+          loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
+          if loc and loc.fragment and not loc.fragment.empty?
+            # merge into a new Hash: the caller may reuse _opts_ for other URLs
+            opts = opts.merge(:uri_fragment => loc.fragment) unless opts[:uri_fragment]
+          end
+          ret.merge!(Utils.get_string_html_info(partial, opts))
+        end
+        return ret
+      else
+        raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+      end
+    end
+
+    # Extract title (String#ircify_html_title) and content (first par,
+    # Utils.ircify_first_html_par) from the HTML or XML String _text_,
+    # working on copies of _text_ and _opts_. Returns a Hash with keys
+    # :title and :content (nil when the first par is empty).
+    #
+    # _opts_ goes to Utils.ircify_first_html_par (:strip defaults to the title).
+    # uri_fragment:: the URI fragment of the original request; the text up to
+    #                and including the first matching <a name=...> tag is skipped
+    def Utils.get_string_html_info(text, opts={})
+      txt = text.dup
+      title = txt.ircify_html_title
+      if frag = opts[:uri_fragment] and not frag.empty?
+        # frag comes from the requested URL: escape it so it is matched literally
+        txt.sub!(/.*?<a\s+[^>]*name=["']?#{Regexp.escape(frag)}["']?.*?>/im, '')
+      end
+      c_opts = opts.dup
+      c_opts[:strip] ||= title
+      content = Utils.ircify_first_html_par(txt, c_opts)
+      content = nil if content.empty?
+      return {:title => title, :content => content}
+    end
+
     # Get the first pars of the first _count_ _urls_.
     # The pages are downloaded using the bot httputil service.
     # Returns an array of the first paragraphs fetched.
@@ -644,27 +588,20 @@ module ::Irc
         url = urls.shift
         idx += 1
 
-        # FIXME what happens if some big file is returned? We should share
-        # code with the url plugin to only retrieve partial file content!
-        xml = self.bot.httputil.get(url)
-        if xml.nil?
-          debug "Unable to retrieve #{url}"
-          next
-        end
-        par = Utils.ircify_first_html_par(xml, opts)
-        if par.empty?
-          debug "No first par found\n#{xml}"
-          # FIXME only do this if the 'url' plugin is loaded
-          # TODO even better, put the code here
-          # par = @bot.plugins['url'].get_title_from_html(xml)
-          if par.empty?
-            retval.push(nil)
-            next
+        begin
+          info = Utils.get_html_info(URI.parse(url), opts)
+
+          par = info[:content]
+          retval.push(par)
+
+          if par
+            msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+            count -=1
           end
+        rescue
+          debug "Unable to retrieve #{url}: #{$!}"
+          next
         end
-        msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
-        count -=1
-        retval.push(par)
       end
       return retval
     end
@@ -672,4 +609,4 @@ module ::Irc
   end
 end
 
-Irc::Utils.bot = Irc::Plugins.manager.bot
+Irc::Utils.bot = Irc::Bot::Plugins.manager.bot