first_html_par: build lists 'manually' when using Hpricot

[user/henk/code/ruby/rbot.git] / lib / rbot / core / utils / utils.rb
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index 1c27aa023fc1de39b0f050ac4d286970a076bd79..9b678defaa0400ef68be01a8ae6fa122850db873 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -15,9 +15,9 @@
  require 'tempfile'
  require 'set'
  
+# Try to load htmlentities, fall back to an HTML escape table.
  begin
    require 'htmlentities'
-  $we_have_html_entities_decoder = true
  rescue LoadError
    gems = nil
    begin
@@ -28,7 +28,6 @@ rescue LoadError
    if gems
      retry
    else
-    $we_have_html_entities_decoder = false
      module ::Irc
        module Utils
          UNESCAPE_TABLE = {
@@ -304,18 +303,54 @@ rescue LoadError
    end
  end
  
+begin
+  require 'htmlentities'
+rescue LoadError
+  gems = nil
+  begin
+    gems = require 'rubygems'
+  rescue LoadError
+    gems = false
+  end
+  if gems
+    retry
+  else
+    module ::Irc
+      module Utils
+        # Some regular expressions to manage HTML data
+
+        # Title
+        TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
+
+        # H1, H2, etc
+        HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
+        # A paragraph
+        PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+        # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
+        # to mark actual text
+        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+        # At worst, we can try the text enclosed between two <br> tags
+        AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
+      end
+    end
+  end
+end
  
  module ::Irc
  
-  # miscellaneous useful functions
+  # Miscellaneous useful functions
    module Utils
      @@bot = nil unless defined? @@bot
      @@safe_save_dir = nil unless defined?(@@safe_save_dir)
  
+    # The bot instance
      def Utils.bot
        @@bot
      end
  
+    # Set up some Utils routines which depend on the associated bot.
      def Utils.bot=(b)
        debug "initializing utils"
        @@bot = b
@@ -323,12 +358,18 @@ module ::Irc
      end
  
  
+    # Seconds per minute
      SEC_PER_MIN = 60
+    # Seconds per hour
      SEC_PER_HR = SEC_PER_MIN * 60
+    # Seconds per day
      SEC_PER_DAY = SEC_PER_HR * 24
+    # Seconds per (30-day) month
      SEC_PER_MNTH = SEC_PER_DAY * 30
+    # Seconds per (30*12 = 360 day) year
      SEC_PER_YR = SEC_PER_MNTH * 12
  
+    # Auxiliary method needed by Utils.secs_to_string
      def Utils.secs_to_string_case(array, var, string, plural)
        case var
        when 1
@@ -338,43 +379,46 @@ module ::Irc
        end
      end
  
-    # turn a number of seconds into a human readable string, e.g
-    # 2 days, 3 hours, 18 minutes, 10 seconds
+    # Turn a number of seconds into a human-readable string, e.g.
+    # 2 days, 3 hours, 18 minutes and 10 seconds
      def Utils.secs_to_string(secs)
        ret = []
        years, secs = secs.divmod SEC_PER_YR
-      secs_to_string_case(ret, years, "year", "years") if years > 0
+      secs_to_string_case(ret, years, _("year"), _("years")) if years > 0
        months, secs = secs.divmod SEC_PER_MNTH
-      secs_to_string_case(ret, months, "month", "months") if months > 0
+      secs_to_string_case(ret, months, _("month"), _("months")) if months > 0
        days, secs = secs.divmod SEC_PER_DAY
-      secs_to_string_case(ret, days, "day", "days") if days > 0
+      secs_to_string_case(ret, days, _("day"), _("days")) if days > 0
        hours, secs = secs.divmod SEC_PER_HR
-      secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
+      secs_to_string_case(ret, hours, _("hour"), _("hours")) if hours > 0
        mins, secs = secs.divmod SEC_PER_MIN
-      secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
+      secs_to_string_case(ret, mins, _("minute"), _("minutes")) if mins > 0
        secs = secs.to_i
-      secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
+      secs_to_string_case(ret, secs, _("second"), _("seconds")) if secs > 0 or ret.empty?
        case ret.length
        when 0
          raise "Empty ret array!"
        when 1
          return ret.to_s
        else
-        return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
+        return [ret[0, ret.length-1].join(", ") , ret[-1]].join(_(" and "))
        end
      end
  
  
+    # Execute an external program, returning a String obtained by redirecting
+    # the program's standard error and output
+    #
      def Utils.safe_exec(command, *args)
-      IO.popen("-") {|p|
-        if(p)
+      IO.popen("-") { |p|
+        if p
            return p.readlines.join("\n")
          else
            begin
              $stderr.reopen($stdout)
              exec(command, *args)
            rescue Exception => e
-            puts "exec of #{command} led to exception: #{e.inspect}"
+            puts "exec of #{command} led to exception: #{e.pretty_inspect}"
              Kernel::exit! 0
            end
            puts "exec of #{command} failed"
@@ -384,6 +428,11 @@ module ::Irc
      end
  
  
+    # Safely (atomically) save to _file_, by passing a tempfile to the block
+    # and then moving the tempfile to its final location when done.
+    #
+    # call-seq: Utils.safe_save(file, &block)
+    #
      def Utils.safe_save(file)
        raise 'No safe save directory defined!' if @@safe_save_dir.nil?
        basename = File.basename(file)
@@ -395,8 +444,11 @@ module ::Irc
      end
  
  
+    # Decode HTML entities in the String _str_, using HTMLEntities if the
+    # package was found, or UNESCAPE_TABLE otherwise.
+    #
      def Utils.decode_html_entities(str)
-      if $we_have_html_entities_decoder
+      if defined? ::HTMLEntities
          return HTMLEntities.decode_entities(str)
        else
          str.gsub(/(&(.+?);)/) {
@@ -412,26 +464,122 @@ module ::Irc
        end
      end
  
-    HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
-    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
-    # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
-    # to mark actual text
-    AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
-    # At worst, we can try stuff which is comprised between two <br>
-    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
-
      # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
      # If possible, grab the one after the first heading
      #
      # It is possible to pass some options to determine how the stripping
      # occurs. Currently supported options are
-    #   * :strip => Regex or String to strip at the beginning of the obtained
-    #               text
-    #   * :min_spaces => Minimum number of spaces a paragraph should have
+    # strip:: Regex or String to strip at the beginning of the obtained
+    #         text
+    # min_spaces:: minimum number of spaces a paragraph should have
      #
      def Utils.ircify_first_html_par(xml_org, opts={})
+      if defined? ::Hpricot
+        Utils.ircify_first_html_par_wh(xml_org, opts)
+      else
+        Utils.ircify_first_html_par_woh(xml_org, opts)
+      end
+    end
+
+    # HTML first par grabber using hpricot
+    def Utils.ircify_first_html_par_wh(xml_org, opts={})
+      doc = Hpricot(xml_org)
+
+      # Strip styles and scripts
+      (doc/"style|script").remove
+
+      debug doc.inspect
+
+      strip = opts[:strip]
+      strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
+
+      min_spaces = opts[:min_spaces] || 8
+      min_spaces = 0 if min_spaces < 0
+
+      txt = String.new
+
+      pre_h = pars = by_span = nil
+
+      while true
+        debug "Minimum number of spaces: #{min_spaces}"
+
+        # Initial attempt: <p> that follows <h\d>
+        if pre_h.nil?
+          pre_h = Hpricot::Elements[]
+          found_h = false
+          doc.root.search("*") { |e|
+            case e.pathname
+            when /^h\d/
+              found_h = true
+            when 'p'
+              pre_h << e if found_h
+            end
+          }
+          debug "Hx: found: #{pre_h.pretty_inspect}"
+        end
+
+        pre_h.each { |p|
+          debug p
+          txt = p.to_html.ircify_html
+          txt.sub!(strip, '') if strip
+          debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+          break unless txt.empty? or txt.count(" ") < min_spaces
+        }
+
+        return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+        # Second natural attempt: just get any <p>
+        pars = doc/"p" if pars.nil?
+        debug "par: found: #{pars.pretty_inspect}"
+        pars.each { |p|
+          debug p
+          txt = p.to_html.ircify_html
+          txt.sub!(strip, '') if strip
+          debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+          break unless txt.empty? or txt.count(" ") < min_spaces
+        }
+
+        return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+        # Nothing yet ... let's get drastic: we look for non-par elements too,
+        # but only for those that match something that we know is likely to
+        # contain text
+
+        # Some blogging and forum platforms use spans or divs with a 'body' or
+        # 'message' or 'text' in their class to mark actual text. Since we want
+        # the class match to be partial and case insensitive, we collect
+        # the common elements that may have this class and then filter out those
+        # we don't need
+        if by_span.nil?
+          by_span = Hpricot::Elements[]
+          doc.root.each("*") { |el|
+            by_span.push el if el.pathname =~ /^(?:div|span|td|tr|tbody|table)$/ and el[:class] =~ /body|message|text/i
+          }
+          debug "other \#1: found: #{by_span.pretty_inspect}"
+        end
+
+        by_span.each { |p|
+          debug p
+          txt = p.to_html.ircify_html
+          txt.sub!(strip, '') if strip
+          debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
+          break unless txt.empty? or txt.count(" ") < min_spaces
+        }
+
+        return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+        # At worst, we can try the text enclosed between two <br> tags
+        # TODO
+
+        debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
+        return txt unless txt.count(" ") < min_spaces
+        break if min_spaces == 0
+        min_spaces /= 2
+      end
+    end
+
+    # HTML first par grabber without hpricot
+    def Utils.ircify_first_html_par_woh(xml_org, opts={})
        xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
  
        strip = opts[:strip]
@@ -503,10 +651,98 @@ module ::Irc
  
          debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
          return txt unless txt.count(" ") < min_spaces
+        break if min_spaces == 0
          min_spaces /= 2
        end
      end
  
+    # This method extracts title, content (first par) and extra
+    # information from the given document _doc_.
+    #
+    # _doc_ can be a URI, a Net::HTTPResponse or a String.
+    #
+    # If _doc_ is a String, only title and content information
+    # are retrieved (if possible), using standard methods.
+    #
+    # If _doc_ is a URI or a Net::HTTPResponse, additional
+    # information is retrieved, and special title/summary
+    # extraction routines are used if possible.
+    #
+    def Utils.get_html_info(doc, opts={})
+      case doc
+      when String
+        Utils.get_string_html_info(doc, opts)
+      when Net::HTTPResponse
+        Utils.get_resp_html_info(doc, opts)
+      when URI
+        if doc.fragment and not doc.fragment.empty?
+          opts[:uri_fragment] ||= doc.fragment
+        end
+        ret = Hash.new
+        @@bot.httputil.get_response(doc) { |resp|
+          ret = Utils.get_resp_html_info(resp, opts)
+        }
+        return ret
+      else
+        raise
+      end
+    end
+
+    class ::UrlLinkError < RuntimeError
+    end
+
+    # This method extracts title, content (first par) and extra
+    # information from the given Net::HTTPResponse _resp_.
+    #
+    # _opts_ is passed on to Utils.get_string_html_info, which honours e.g.
+    # uri_fragment:: the URI fragment of the original request
+    #
+    # Returns a Hash with the following keys:
+    # title:: the title of the document (if any)
+    # content:: the first paragraph of the document (if any)
+    # headers::
+    #   the headers of the Net::HTTPResponse. The value is
+    #   a Hash whose keys are lowercase forms of the HTTP
+    #   header fields, and whose values are Arrays.
+    #
+    def Utils.get_resp_html_info(resp, opts={})
+      ret = Hash.new
+      case resp
+      when Net::HTTPSuccess
+        ret[:headers] = resp.to_hash
+
+        if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
+          partial = resp.partial_body(@@bot.config['http.info_bytes'])
+          ret.merge!(Utils.get_string_html_info(partial, opts))
+        end
+        return ret
+      else
+        raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+      end
+    end
+
+    # This method extracts title and content (first par)
+    # from the given HTML or XML document _text_, using
+    # standard methods (String#ircify_html_title,
+    # Utils.ircify_first_html_par)
+    #
+    # _opts_ is passed on to Utils.ircify_first_html_par; also accepted is
+    # uri_fragment:: the URI fragment of the original request
+    #
+    def Utils.get_string_html_info(text, opts={})
+      txt = text.dup
+      title = txt.ircify_html_title
+      if frag = opts[:uri_fragment] and not frag.empty?
+        fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
+        txt.sub!(fragreg,'')
+      end
+      c_opts = opts.dup
+      c_opts[:strip] ||= title
+      content = Utils.ircify_first_html_par(txt, c_opts)
+      content = nil if content.empty?
+      return {:title => title, :content => content}
+    end
+
      # Get the first pars of the first _count_ _urls_.
      # The pages are downloaded using the bot httputil service.
      # Returns an array of the first paragraphs fetched.
@@ -521,33 +757,25 @@ module ::Irc
          url = urls.shift
          idx += 1
  
-        # FIXME what happens if some big file is returned? We should share
-        # code with the url plugin to only retrieve partial file content!
-        xml = self.bot.httputil.get(url)
-        if xml.nil?
-          debug "Unable to retrieve #{url}"
-          next
-        end
-        par = Utils.ircify_first_html_par(xml, opts)
-        if par.empty?
-          debug "No first par found\n#{xml}"
-          # FIXME only do this if the 'url' plugin is loaded
-          # TODO even better, put the code here
-          # par = @bot.plugins['url'].get_title_from_html(xml)
-          if par.empty?
-            retval.push(nil)
-            next
+        begin
+          info = Utils.get_html_info(URI.parse(url), opts)
+
+          par = info[:content]
+          retval.push(par)
+
+          if par
+            msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+            count -=1
            end
+        rescue
+          debug "Unable to retrieve #{url}: #{$!}"
+          next
          end
-        msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
-        count -=1
-        retval.push(par)
        end
        return retval
      end
  
-
    end
  end
  
-Irc::Utils.bot = Irc::Plugins.manager.bot
+Irc::Utils.bot = Irc::Bot::Plugins.manager.bot