utils: support hex HTML entities

[user/henk/code/ruby/rbot.git] / lib / rbot / core / utils / utils.rb
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index 0b10b52fd1ab3efe5dfba6b86187fe989fa86e77..7b316ffe28cd3df6ffc6ecd0d1707063142dd1bb 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -6,9 +6,6 @@
  # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
  # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
  #
-# Copyright:: (C) 2002-2006 Tom Gilbert
-# Copyright:: (C) 2007 Giuseppe Bilotta
-#
  # TODO some of these Utils should be rewritten as extensions to the approriate
  # standard Ruby classes and accordingly be moved to extends.rb
  
@@ -19,302 +16,103 @@ require 'set'
  begin
    require 'htmlentities'
  rescue LoadError
-  gems = nil
-  begin
-    gems = require 'rubygems'
-  rescue LoadError
-    gems = false
-  end
-  if gems
-    retry
-  else
      module ::Irc
        module Utils
          UNESCAPE_TABLE = {
-    'laquo' => '<<',
-    'raquo' => '>>',
+    'laquo' => '«',
+    'raquo' => '»',
      'quot' => '"',
      'apos' => '\'',
-    'micro' => 'u',
-    'copy' => '(c)',
-    'trade' => '(tm)',
-    'reg' => '(R)',
-    '#174' => '(R)',
-    '#8220' => '"',
-    '#8221' => '"',
-    '#8212' => '--',
-    '#39' => '\'',
+    'deg' => '°',
+    'micro' => 'µ',
+    'copy' => '©',
+    'trade' => '™',
+    'reg' => '®',
      'amp' => '&',
      'lt' => '<',
      'gt' => '>',
-    'hellip' => '...',
-    'nbsp' => ' ',
-=begin
-    # extras codes, for future use...
-    'zwnj' => '&#8204;',
-    'aring' => '\xe5',
-    'gt' => '>',
-    'yen' => '\xa5',
-    'ograve' => '\xf2',
-    'Chi' => '&#935;',
-    'bull' => '&#8226;',
-    'Egrave' => '\xc8',
-    'Ntilde' => '\xd1',
-    'upsih' => '&#978;',
-    'Yacute' => '\xdd',
-    'asymp' => '&#8776;',
-    'radic' => '&#8730;',
-    'otimes' => '&#8855;',
-    'nabla' => '&#8711;',
-    'aelig' => '\xe6',
-    'oelig' => '&#339;',
-    'equiv' => '&#8801;',
-    'Psi' => '&#936;',
-    'auml' => '\xe4',
-    'circ' => '&#710;',
-    'Acirc' => '\xc2',
-    'Epsilon' => '&#917;',
-    'Yuml' => '&#376;',
-    'Eta' => '&#919;',
-    'Icirc' => '\xce',
-    'Upsilon' => '&#933;',
-    'ndash' => '&#8211;',
-    'there4' => '&#8756;',
-    'Prime' => '&#8243;',
-    'prime' => '&#8242;',
-    'psi' => '&#968;',
-    'Kappa' => '&#922;',
-    'rsaquo' => '&#8250;',
-    'Tau' => '&#932;',
-    'darr' => '&#8595;',
-    'ocirc' => '\xf4',
-    'lrm' => '&#8206;',
-    'zwj' => '&#8205;',
-    'cedil' => '\xb8',
-    'Ecirc' => '\xca',
-    'not' => '\xac',
-    'AElig' => '\xc6',
-    'oslash' => '\xf8',
-    'acute' => '\xb4',
-    'lceil' => '&#8968;',
-    'shy' => '\xad',
-    'rdquo' => '&#8221;',
-    'ge' => '&#8805;',
-    'Igrave' => '\xcc',
-    'Ograve' => '\xd2',
-    'euro' => '&#8364;',
-    'dArr' => '&#8659;',
-    'sdot' => '&#8901;',
-    'nbsp' => '\xa0',
-    'lfloor' => '&#8970;',
-    'lArr' => '&#8656;',
-    'Auml' => '\xc4',
-    'larr' => '&#8592;',
-    'Atilde' => '\xc3',
-    'Otilde' => '\xd5',
-    'szlig' => '\xdf',
-    'clubs' => '&#9827;',
-    'diams' => '&#9830;',
-    'agrave' => '\xe0',
-    'Ocirc' => '\xd4',
-    'Iota' => '&#921;',
-    'Theta' => '&#920;',
-    'Pi' => '&#928;',
-    'OElig' => '&#338;',
-    'Scaron' => '&#352;',
-    'frac14' => '\xbc',
-    'egrave' => '\xe8',
-    'sub' => '&#8834;',
-    'iexcl' => '\xa1',
-    'frac12' => '\xbd',
-    'sbquo' => '&#8218;',
-    'ordf' => '\xaa',
-    'sum' => '&#8721;',
-    'prop' => '&#8733;',
-    'Uuml' => '\xdc',
-    'ntilde' => '\xf1',
-    'sup' => '&#8835;',
-    'theta' => '&#952;',
-    'prod' => '&#8719;',
-    'nsub' => '&#8836;',
-    'hArr' => '&#8660;',
-    'rlm' => '&#8207;',
-    'THORN' => '\xde',
-    'infin' => '&#8734;',
-    'yuml' => '\xff',
-    'Mu' => '&#924;',
-    'le' => '&#8804;',
-    'Eacute' => '\xc9',
-    'thinsp' => '&#8201;',
-    'ecirc' => '\xea',
-    'bdquo' => '&#8222;',
-    'Sigma' => '&#931;',
-    'fnof' => '&#402;',
-    'Aring' => '\xc5',
-    'tilde' => '&#732;',
-    'frac34' => '\xbe',
-    'emsp' => '&#8195;',
-    'mdash' => '&#8212;',
-    'uarr' => '&#8593;',
-    'permil' => '&#8240;',
-    'Ugrave' => '\xd9',
-    'rarr' => '&#8594;',
-    'Agrave' => '\xc0',
-    'chi' => '&#967;',
-    'forall' => '&#8704;',
-    'eth' => '\xf0',
-    'rceil' => '&#8969;',
-    'iuml' => '\xef',
-    'gamma' => '&#947;',
-    'lambda' => '&#955;',
-    'harr' => '&#8596;',
-    'rang' => '&#9002;',
-    'xi' => '&#958;',
-    'dagger' => '&#8224;',
-    'divide' => '\xf7',
-    'Ouml' => '\xd6',
-    'image' => '&#8465;',
-    'alefsym' => '&#8501;',
-    'igrave' => '\xec',
-    'otilde' => '\xf5',
-    'Oacute' => '\xd3',
-    'sube' => '&#8838;',
-    'alpha' => '&#945;',
-    'frasl' => '&#8260;',
-    'ETH' => '\xd0',
-    'lowast' => '&#8727;',
-    'Nu' => '&#925;',
-    'plusmn' => '\xb1',
-    'Euml' => '\xcb',
-    'real' => '&#8476;',
-    'sup1' => '\xb9',
-    'sup2' => '\xb2',
-    'sup3' => '\xb3',
-    'Oslash' => '\xd8',
-    'Aacute' => '\xc1',
-    'cent' => '\xa2',
-    'oline' => '&#8254;',
-    'Beta' => '&#914;',
-    'perp' => '&#8869;',
-    'Delta' => '&#916;',
-    'loz' => '&#9674;',
-    'pi' => '&#960;',
-    'iota' => '&#953;',
-    'empty' => '&#8709;',
-    'euml' => '\xeb',
-    'brvbar' => '\xa6',
-    'iacute' => '\xed',
-    'para' => '\xb6',
-    'micro' => '\xb5',
-    'cup' => '&#8746;',
-    'weierp' => '&#8472;',
-    'uuml' => '\xfc',
-    'part' => '&#8706;',
-    'icirc' => '\xee',
-    'delta' => '&#948;',
-    'omicron' => '&#959;',
-    'upsilon' => '&#965;',
-    'Iuml' => '\xcf',
-    'Lambda' => '&#923;',
-    'Xi' => '&#926;',
-    'kappa' => '&#954;',
-    'ccedil' => '\xe7',
-    'Ucirc' => '\xdb',
-    'cap' => '&#8745;',
-    'mu' => '&#956;',
-    'scaron' => '&#353;',
-    'lsquo' => '&#8216;',
-    'isin' => '&#8712;',
-    'Zeta' => '&#918;',
-    'supe' => '&#8839;',
-    'deg' => '\xb0',
-    'and' => '&#8743;',
-    'tau' => '&#964;',
-    'pound' => '\xa3',
-    'hellip' => '&#8230;',
-    'curren' => '\xa4',
-    'int' => '&#8747;',
-    'ucirc' => '\xfb',
-    'rfloor' => '&#8971;',
-    'ensp' => '&#8194;',
-    'crarr' => '&#8629;',
-    'ugrave' => '\xf9',
-    'notin' => '&#8713;',
-    'exist' => '&#8707;',
-    'uArr' => '&#8657;',
-    'cong' => '&#8773;',
-    'Dagger' => '&#8225;',
-    'oplus' => '&#8853;',
-    'times' => '\xd7',
-    'atilde' => '\xe3',
-    'piv' => '&#982;',
-    'ni' => '&#8715;',
-    'Phi' => '&#934;',
-    'lsaquo' => '&#8249;',
-    'Uacute' => '\xda',
-    'Omicron' => '&#927;',
-    'ang' => '&#8736;',
-    'ne' => '&#8800;',
-    'iquest' => '\xbf',
-    'eta' => '&#951;',
-    'yacute' => '\xfd',
-    'Rho' => '&#929;',
-    'uacute' => '\xfa',
-    'Alpha' => '&#913;',
-    'zeta' => '&#950;',
-    'Omega' => '&#937;',
-    'nu' => '&#957;',
-    'sim' => '&#8764;',
-    'sect' => '\xa7',
-    'phi' => '&#966;',
-    'sigmaf' => '&#962;',
-    'macr' => '\xaf',
-    'minus' => '&#8722;',
-    'Ccedil' => '\xc7',
-    'ordm' => '\xba',
-    'epsilon' => '&#949;',
-    'beta' => '&#946;',
-    'rArr' => '&#8658;',
-    'rho' => '&#961;',
-    'aacute' => '\xe1',
-    'eacute' => '\xe9',
-    'omega' => '&#969;',
-    'middot' => '\xb7',
-    'Gamma' => '&#915;',
-    'Iacute' => '\xcd',
-    'lang' => '&#9001;',
-    'spades' => '&#9824;',
-    'rsquo' => '&#8217;',
-    'uml' => '\xa8',
-    'thorn' => '\xfe',
-    'ouml' => '\xf6',
-    'thetasym' => '&#977;',
-    'or' => '&#8744;',
-    'raquo' => '\xbb',
-    'acirc' => '\xe2',
-    'ldquo' => '&#8220;',
-    'hearts' => '&#9829;',
-    'sigma' => '&#963;',
-    'oacute' => '\xf3',
-=end
+    'hellip' => '…',
+    'nbsp' => ' ',
+    'ndash' => '–',
+    'Agrave' => 'À',
+    'Aacute' => 'Á',
+    'Acirc' => 'Â',
+    'Atilde' => 'Ã',
+    'Auml' => 'Ä',
+    'Aring' => 'Å',
+    'AElig' => 'Æ',
+    'OElig' => 'Œ',
+    'Ccedil' => 'Ç',
+    'Egrave' => 'È',
+    'Eacute' => 'É',
+    'Ecirc' => 'Ê',
+    'Euml' => 'Ë',
+    'Igrave' => 'Ì',
+    'Iacute' => 'Í',
+    'Icirc' => 'Î',
+    'Iuml' => 'Ï',
+    'ETH' => 'Ð',
+    'Ntilde' => 'Ñ',
+    'Ograve' => 'Ò',
+    'Oacute' => 'Ó',
+    'Ocirc' => 'Ô',
+    'Otilde' => 'Õ',
+    'Ouml' => 'Ö',
+    'Oslash' => 'Ø',
+    'Ugrave' => 'Ù',
+    'Uacute' => 'Ú',
+    'Ucirc' => 'Û',
+    'Uuml' => 'Ü',
+    'Yacute' => 'Ý',
+    'THORN' => 'Þ',
+    'szlig' => 'ß',
+    'agrave' => 'à',
+    'aacute' => 'á',
+    'acirc' => 'â',
+    'atilde' => 'ã',
+    'auml' => 'ä',
+    'aring' => 'å',
+    'aelig' => 'æ',
+    'oelig' => 'œ',
+    'ccedil' => 'ç',
+    'egrave' => 'è',
+    'eacute' => 'é',
+    'ecirc' => 'ê',
+    'euml' => 'ë',
+    'igrave' => 'ì',
+    'iacute' => 'í',
+    'icirc' => 'î',
+    'iuml' => 'ï',
+    'eth' => 'ð',
+    'ntilde' => 'ñ',
+    'ograve' => 'ò',
+    'oacute' => 'ó',
+    'ocirc' => 'ô',
+    'otilde' => 'õ',
+    'ouml' => 'ö',
+    'oslash' => 'ø',
+    'ugrave' => 'ù',
+    'uacute' => 'ú',
+    'ucirc' => 'û',
+    'uuml' => 'ü',
+    'yacute' => 'ý',
+    'thorn' => 'þ',
+    'yuml' => 'ÿ'
          }
        end
      end
-  end
  end
  
  begin
-  require 'htmlentities'
-rescue LoadError
-  gems = nil
-  begin
-    gems = require 'rubygems'
-  rescue LoadError
-    gems = false
+  require 'hpricot'
+  module ::Irc
+    module Utils
+      AFTER_PAR_PATH = /^(?:div|span)$/
+      AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/
+      AFTER_PAR_CLASS = /body|message|text/i
+    end
    end
-  if gems
-    retry
-  else
+rescue LoadError
      module ::Irc
        module Utils
          # Some regular expressions to manage HTML data
@@ -329,13 +127,12 @@ rescue LoadError
  
          # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
          # to mark actual text
-        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
  
          # At worst, we can try stuff which is comprised between two <br>
          AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
        end
      end
-  end
  end
  
  module ::Irc
@@ -354,7 +151,7 @@ module ::Irc
      def Utils.bot=(b)
        debug "initializing utils"
        @@bot = b
-      @@safe_save_dir = "#{@@bot.botclass}/safe_save"
+      @@safe_save_dir = @@bot.path('safe_save')
      end
  
  
@@ -364,10 +161,12 @@ module ::Irc
      SEC_PER_HR = SEC_PER_MIN * 60
      # Seconds per day
      SEC_PER_DAY = SEC_PER_HR * 24
+    # Seconds per week
+    SEC_PER_WK = SEC_PER_DAY * 7
      # Seconds per (30-day) month
      SEC_PER_MNTH = SEC_PER_DAY * 30
-    # Second per (30*12 = 360 day) year
-    SEC_PER_YR = SEC_PER_MNTH * 12
+    # Second per (non-leap) year
+    SEC_PER_YR = SEC_PER_DAY * 365
  
      # Auxiliary method needed by Utils.secs_to_string
      def Utils.secs_to_string_case(array, var, string, plural)
@@ -405,28 +204,120 @@ module ::Irc
        end
      end
  
+    # Turn a number of seconds into a hours:minutes:seconds e.g.
+    # 3:18:10 or 5'12" or 7s
+    #
+    def Utils.secs_to_short(seconds)
+      secs = seconds.to_i # make sure it's an integer
+      mins, secs = secs.divmod 60
+      hours, mins = mins.divmod 60
+      if hours > 0
+        return ("%s:%s:%s" % [hours, mins, secs])
+      elsif mins > 0
+        return ("%s'%s\"" % [mins, secs])
+      else
+        return ("%ss" % [secs])
+      end
+    end
+
+    # Returns human readable time.
+    # Like: 5 days ago
+    #       about one hour ago
+    # options
+    # :start_date, sets the time to measure against, defaults to now
+    # :date_format, used with <tt>to_formatted_s<tt>, default to :default
+    def Utils.timeago(time, options = {})
+      start_date = options.delete(:start_date) || Time.new
+      date_format = options.delete(:date_format) || "%x"
+      delta = (start_date - time).round
+      if delta.abs < 2
+        _("right now")
+      else
+        distance = Utils.age_string(delta)
+        if delta < 0
+          _("%{d} from now") % {:d => distance}
+        else
+          _("%{d} ago") % {:d => distance}
+        end
+      end
+    end
+
+    # Converts age in seconds to "nn units". Inspired by previous attempts
+    # but also gitweb's age_string() sub
+    def Utils.age_string(secs)
+      case
+      when secs < 0
+        Utils.age_string(-secs)
+      when secs > 2*SEC_PER_YR
+        _("%{m} years") % { :m => secs/SEC_PER_YR }
+      when secs > 2*SEC_PER_MNTH
+        _("%{m} months") % { :m => secs/SEC_PER_MNTH }
+      when secs > 2*SEC_PER_WK
+        _("%{m} weeks") % { :m => secs/SEC_PER_WK }
+      when secs > 2*SEC_PER_DAY
+        _("%{m} days") % { :m => secs/SEC_PER_DAY }
+      when secs > 2*SEC_PER_HR
+        _("%{m} hours") % { :m => secs/SEC_PER_HR }
+      when (20*SEC_PER_MIN..40*SEC_PER_MIN).include?(secs)
+        _("half an hour")
+      when (50*SEC_PER_MIN..70*SEC_PER_MIN).include?(secs)
+        # _("about one hour")
+        _("an hour")
+      when (80*SEC_PER_MIN..100*SEC_PER_MIN).include?(secs)
+        _("an hour and a half")
+      when secs > 2*SEC_PER_MIN
+        _("%{m} minutes") % { :m => secs/SEC_PER_MIN }
+      when secs > 1
+        _("%{m} seconds") % { :m => secs }
+      else
+        _("one second")
+      end
+    end
  
      # Execute an external program, returning a String obtained by redirecting
-    # the program's standards errors and output 
+    # the program's standards errors and output
      #
+    # TODO: find a way to expose some common errors (e.g. Errno::NOENT)
+    # to the caller
      def Utils.safe_exec(command, *args)
-      IO.popen("-") { |p|
+      output = IO.popen("-") { |p|
          if p
-          return p.readlines.join("\n")
+          break p.readlines.join("\n")
          else
            begin
              $stderr.reopen($stdout)
              exec(command, *args)
            rescue Exception => e
-            puts "exec of #{command} led to exception: #{e.pretty_inspect}"
-            Kernel::exit! 0
+            puts "exception #{e.pretty_inspect} trying to run #{command}"
+            Kernel::exit! 1
            end
            puts "exec of #{command} failed"
-          Kernel::exit! 0
+          Kernel::exit! 1
          end
        }
+      raise "safe execution of #{command} returned #{$?}" unless $?.success?
+      return output
      end
  
+    # Try executing an external program, returning true if the run was successful
+    # and false otherwise
+    def Utils.try_exec(command, *args)
+      IO.popen("-") { |p|
+        if p.nil?
+          begin
+            $stderr.reopen($stdout)
+            exec(command, *args)
+          rescue Exception => e
+            Kernel::exit! 1
+          end
+          Kernel::exit! 1
+        else
+          debug p.readlines
+        end
+      }
+      debug $?
+      return $?.success?
+    end
  
      # Safely (atomically) save to _file_, by passing a tempfile to the block
      # and then moving the tempfile to its final location when done.
@@ -447,19 +338,32 @@ module ::Irc
      # Decode HTML entities in the String _str_, using HTMLEntities if the
      # package was found, or UNESCAPE_TABLE otherwise.
      #
-    def Utils.decode_html_entities(str)
-      if defined? ::HTMLEntities
-        return HTMLEntities.decode_entities(str)
+
+    if defined? ::HTMLEntities
+      if ::HTMLEntities.respond_to? :decode_entities
+        def Utils.decode_html_entities(str)
+          return HTMLEntities.decode_entities(str)
+        end
        else
-        str.gsub(/(&(.+?);)/) {
+        @@html_entities = HTMLEntities.new
+        def Utils.decode_html_entities(str)
+          return @@html_entities.decode str
+        end
+      end
+    else
+      def Utils.decode_html_entities(str)
+        return str.gsub(/(&(.+?);)/) {
            symbol = $2
            # remove the 0-paddng from unicode integers
-          if symbol =~ /#(.+)/
-            symbol = "##{$1.to_i.to_s}"
+          case symbol
+          when /^#x([0-9a-fA-F]+)$/
+            symbol = $1.to_i(16).to_s
+          when /^#(\d+)$/
+            symbol = $1.to_i.to_s
            end
  
            # output the symbol's irc-translated character, or a * if it's unknown
-          UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
+          UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [symbol.to_i].pack("U") : '*')
          }
        end
      end
@@ -488,7 +392,7 @@ module ::Irc
        # Strip styles and scripts
        (doc/"style|script").remove
  
-      debug doc.inspect
+      debug doc
  
        strip = opts[:strip]
        strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -498,25 +402,27 @@ module ::Irc
  
        txt = String.new
  
-      h = %w{h1 h2 h3 h4 h5 h6}
-      p = %w{p}
-      ar = []
-      h.each { |hx|
-        p.each { |px|
-          ar << "#{hx}~#{px}"
-        }
-      }
-      h_p_css = ar.join("|")
-      debug "css search: #{h_p_css}"
-
        pre_h = pars = by_span = nil
  
        while true
          debug "Minimum number of spaces: #{min_spaces}"
  
          # Initial attempt: <p> that follows <h\d>
-        pre_h = doc/h_p_css if pre_h.nil?
-        debug "Hx: found: #{pre_h.pretty_inspect}"
+        if pre_h.nil?
+          pre_h = Hpricot::Elements[]
+          found_h = false
+          doc.search("*") { |e|
+            next if e.bogusetag?
+            case e.pathname
+            when /^h\d/
+              found_h = true
+            when 'p'
+              pre_h << e if found_h
+            end
+          }
+          debug "Hx: found: #{pre_h.pretty_inspect}"
+        end
+
          pre_h.each { |p|
            debug p
            txt = p.to_html.ircify_html
@@ -548,13 +454,23 @@ module ::Irc
          # 'message' or 'text' in their class to mark actual text. Since we want
          # the class match to be partial and case insensitive, we collect
          # the common elements that may have this class and then filter out those
-        # we don't need
+        # we don't need. If no divs or spans are found, we'll accept additional
+        # elements too (td, tr, tbody, table).
          if by_span.nil?
            by_span = Hpricot::Elements[]
-          pre_pars = doc/"div|span|td|tr|tbody|table"
-          pre_pars.each { |el|
-            by_span.push el if el.class =~ /body|message|text/i
+          extra = Hpricot::Elements[]
+          doc.search("*") { |el|
+            next if el.bogusetag?
+            case el.pathname
+            when AFTER_PAR_PATH
+              by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+            when AFTER_PAR_EX
+              extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+            end
            }
+          if by_span.empty? and not extra.empty?
+            by_span.concat extra
+          end
            debug "other \#1: found: #{by_span.pretty_inspect}"
          end
  
@@ -580,7 +496,11 @@ module ::Irc
  
      # HTML first par grabber without hpricot
      def Utils.ircify_first_html_par_woh(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+      xml = xml_org.gsub(/<!--.*?-->/m,
+                         "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+                         "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+                         "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+                         "")
  
        strip = opts[:strip]
        strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -656,6 +576,143 @@ module ::Irc
        end
      end
  
+    # This method extracts title, content (first par) and extra
+    # information from the given document _doc_.
+    #
+    # _doc_ can be an URI, a Net::HTTPResponse or a String.
+    #
+    # If _doc_ is a String, only title and content information
+    # are retrieved (if possible), using standard methods.
+    #
+    # If _doc_ is an URI or a Net::HTTPResponse, additional
+    # information is retrieved, and special title/summary
+    # extraction routines are used if possible.
+    #
+    def Utils.get_html_info(doc, opts={})
+      case doc
+      when String
+        Utils.get_string_html_info(doc, opts)
+      when Net::HTTPResponse
+        Utils.get_resp_html_info(doc, opts)
+      when URI
+        ret = DataStream.new
+        @@bot.httputil.get_response(doc) { |resp|
+          ret.replace Utils.get_resp_html_info(resp, opts)
+        }
+        return ret
+      else
+        raise
+      end
+    end
+
+    class ::UrlLinkError < RuntimeError
+    end
+
+    # This method extracts title, content (first par) and extra
+    # information from the given Net::HTTPResponse _resp_.
+    #
+    # Currently, the only accepted options (in _opts_) are
+    # uri_fragment:: the URI fragment of the original request
+    # full_body::    get the whole body instead of
+    #                @@bot.config['http.info_bytes'] bytes only
+    #
+    # Returns a DataStream with the following keys:
+    # text:: the (partial) body
+    # title:: the title of the document (if any)
+    # content:: the first paragraph of the document (if any)
+    # headers::
+    #   the headers of the Net::HTTPResponse. The value is
+    #   a Hash whose keys are lowercase forms of the HTTP
+    #   header fields, and whose values are Arrays.
+    #
+    def Utils.get_resp_html_info(resp, opts={})
+      case resp
+      when Net::HTTPSuccess
+        loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
+        if loc and loc.fragment and not loc.fragment.empty?
+          opts[:uri_fragment] ||= loc.fragment
+        end
+        ret = DataStream.new(opts.dup)
+        ret[:headers] = resp.to_hash
+        ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes'])
+
+        filtered = Utils.try_htmlinfo_filters(ret)
+
+        if filtered
+          return filtered
+        elsif resp['content-type'] =~ /^text\/|(?:x|ht)ml/
+          ret.merge!(Utils.get_string_html_info(partial, opts))
+        end
+        return ret
+      else
+        raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+      end
+    end
+
+    # This method runs an appropriately-crafted DataStream _ds_ through the
+    # filters in the :htmlinfo filter group, in order. If one of the filters
+    # returns non-nil, its results are merged in _ds_ and returned. Otherwise
+    # nil is returned.
+    #
+    # The input DataStream should have the downloaded HTML as primary key
+    # (:text) and possibly a :headers key holding the resonse headers.
+    #
+    def Utils.try_htmlinfo_filters(ds)
+      filters = @@bot.filter_names(:htmlinfo)
+      return nil if filters.empty?
+      cur = nil
+      # TODO filter priority
+      filters.each { |n|
+        debug "testing htmlinfo filter #{n}"
+        cur = @@bot.filter(@@bot.global_filter_name(n, :htmlinfo), ds)
+        debug "returned #{cur.pretty_inspect}"
+        break if cur
+      }
+      return ds.merge(cur) if cur
+    end
+
+    # HTML info filters often need to check if the webpage location
+    # of a passed DataStream _ds_ matches a given Regexp.
+    def Utils.check_location(ds, rx)
+      debug ds[:headers]
+      if h = ds[:headers]
+        loc = [h['x-rbot-location'],h['location']].flatten.grep(rx)
+      end
+      loc ||= []
+      debug loc
+      return loc.empty? ? nil : loc
+    end
+
+    # This method extracts title and content (first par)
+    # from the given HTML or XML document _text_, using
+    # standard methods (String#ircify_html_title,
+    # Utils.ircify_first_html_par)
+    #
+    # Currently, the only accepted option (in _opts_) is
+    # uri_fragment:: the URI fragment of the original request
+    #
+    def Utils.get_string_html_info(text, opts={})
+      debug "getting string html info"
+      txt = text.dup
+      title = txt.ircify_html_title
+      debug opts
+      if frag = opts[:uri_fragment] and not frag.empty?
+        fragreg = /<a\s+(?:[^>]+\s+)?(?:name|id)=["']?#{frag}["']?[^>]*>/im
+        debug fragreg
+        debug txt
+        if txt.match(fragreg)
+          # grab the post-match
+          txt = $'
+        end
+        debug txt
+      end
+      c_opts = opts.dup
+      c_opts[:strip] ||= title
+      content = Utils.ircify_first_html_par(txt, c_opts)
+      content = nil if content.empty?
+      return {:title => title, :content => content}
+    end
+
      # Get the first pars of the first _count_ _urls_.
      # The pages are downloaded using the bot httputil service.
      # Returns an array of the first paragraphs fetched.
@@ -670,31 +727,38 @@ module ::Irc
          url = urls.shift
          idx += 1
  
-        # FIXME what happens if some big file is returned? We should share
-        # code with the url plugin to only retrieve partial file content!
-        xml = self.bot.httputil.get(url)
-        if xml.nil?
-          debug "Unable to retrieve #{url}"
-          next
-        end
-        par = Utils.ircify_first_html_par(xml, opts)
-        if par.empty?
-          debug "No first par found\n#{xml}"
-          # FIXME only do this if the 'url' plugin is loaded
-          # TODO even better, put the code here
-          # par = @bot.plugins['url'].get_title_from_html(xml)
-          if par.empty?
-            retval.push(nil)
-            next
+        begin
+          info = Utils.get_html_info(URI.parse(url), opts)
+
+          par = info[:content]
+          retval.push(par)
+
+          if par
+            msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+            count -=1
            end
+        rescue
+          debug "Unable to retrieve #{url}: #{$!}"
+          next
          end
-        msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
-        count -=1
-        retval.push(par)
        end
        return retval
      end
  
+    # Returns a comma separated list except for the last element
+    # which is joined in with specified conjunction
+    #
+    def Utils.comma_list(words, options={})
+      defaults = { :join_with => ", ", :join_last_with => _(" and ") }
+      opts = defaults.merge(options)
+
+      if words.size < 2
+        words.last
+      else
+        [words[0..-2].join(opts[:join_with]), words.last].join(opts[:join_last_with])
+      end
+    end
+
    end
  end