end
end
+ # Turn a number of seconds into an hours:minutes:seconds string, e.g.
+ # 3:18:10 or 5'12" or 7s
+ #
+ def Utils.secs_to_short(seconds)
+ secs = seconds.to_i # make sure it's an integer
+ # split the total into sexagesimal components
+ mins, secs = secs.divmod 60
+ hours, mins = mins.divmod 60
+ # pick the most compact notation for the magnitude
+ if hours > 0
+ # NOTE(review): %s does no zero-padding, so e.g. 3:5:7 rather than
+ # 3:05:07 — confirm this is the intended display format
+ return ("%s:%s:%s" % [hours, mins, secs])
+ elsif mins > 0
+ return ("%s'%s\"" % [mins, secs])
+ else
+ return ("%ss" % [secs])
+ end
+ end
+
# Execute an external program, returning a String obtained by redirecting
# the program's standards errors and output
str.gsub(/(&(.+?);)/) {
symbol = $2
# remove the 0-padding from unicode integers
- if symbol =~ /#(.+)/
- symbol = "##{$1.to_i.to_s}"
+ if symbol =~ /^#(\d+)$/
+ symbol = $1.to_i.to_s
end
# output the symbol's irc-translated character, or a * if it's unknown
- UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [$0.to_i].pack("U") : '*')
+ UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [symbol.to_i].pack("U") : '*')
}
end
end
when Net::HTTPResponse
Utils.get_resp_html_info(doc, opts)
when URI
- ret = Hash.new
+ ret = DataStream.new
@@bot.httputil.get_response(doc) { |resp|
- ret = Utils.get_resp_html_info(resp, opts)
+ ret.replace Utils.get_resp_html_info(resp, opts)
}
return ret
else
# This method extracts title, content (first par) and extra
# information from the given Net::HTTPResponse _resp_.
#
- # Currently, the only accepted option (in _opts_) is
+ # Currently, the only accepted options (in _opts_) are
# uri_fragment:: the URI fragment of the original request
+ # full_body:: get the whole body instead of
+ # @@bot.config['http.info_bytes'] bytes only
#
- # Returns a Hash with the following keys:
+ # Returns a DataStream with the following keys:
+ # text:: the (partial) body
# title:: the title of the document (if any)
# content:: the first paragraph of the document (if any)
# headers::
#   a Hash whose keys are the response header fields, and whose values are Arrays.
#
def Utils.get_resp_html_info(resp, opts={})
- ret = Hash.new
# only successful responses are inspected; anything else falls
# through the case and the method returns nil
case resp
when Net::HTTPSuccess
+ # recover the URI fragment from the (possibly redirected) final
+ # location, so anchors survive redirects; parse errors yield nil
+ loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
+ if loc and loc.fragment and not loc.fragment.empty?
+ opts[:uri_fragment] ||= loc.fragment
+ end
+ ret = DataStream.new(opts.dup)
ret[:headers] = resp.to_hash
+ # :text gets the whole body only when :full_body was requested,
+ # otherwise just the first http.info_bytes bytes
+ ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes'])
- partial = resp.partial_body(@@bot.config['http.info_bytes'])
- if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
- loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
- if loc and loc.fragment and not loc.fragment.empty?
- opts[:uri_fragment] ||= loc.fragment
- end
+ # give the :htmlinfo filter group first crack at the stream
+ filtered = Utils.try_htmlinfo_filters(ret)
+
+ if filtered
+ return filtered
+ elsif resp['content-type'] =~ /^text\/|(?:x|ht)ml/
# no filter claimed it: fall back to the generic title/content scrape
ret.merge!(Utils.get_string_html_info(partial, opts))
end
return ret
end
end
+ # This method runs an appropriately-crafted DataStream _ds_ through the
+ # filters in the :htmlinfo filter group, in order. If one of the filters
+ # returns non-nil, its results are merged in _ds_ and returned. Otherwise
+ # nil is returned.
+ #
+ # The input DataStream should have the downloaded HTML as primary key
+ # (:text) and possibly a :headers key holding the response headers.
+ #
+ def Utils.try_htmlinfo_filters(ds)
+ filters = @@bot.filter_names(:htmlinfo)
+ # nothing registered: bail out early
+ return nil if filters.empty?
+ cur = nil
+ # TODO filter priority
+ filters.each { |n|
+ debug "testing filter #{n}"
+ cur = @@bot.filter(@@bot.global_filter_name(n, :htmlinfo), ds)
+ debug "returned #{cur.pretty_inspect}"
+ # first filter that returns non-nil wins
+ break if cur
+ }
+ # merge the winning filter's result into the stream; implicit nil
+ # return when no filter matched
+ return ds.merge(cur) if cur
+ end
+
+ # HTML info filters often need to check if the webpage location
+ # of a passed DataStream _ds_ matches a given Regexp.
+ def Utils.check_location(ds, rx)
+ debug ds[:headers]
+ if h = ds[:headers]
+ # header values may be strings or arrays of strings (see
+ # Net::HTTPResponse#to_hash); flatten before grepping for _rx_
+ loc = [h['x-rbot-location'],h['location']].flatten.grep(rx)
+ end
+ loc ||= []
+ debug loc
+ # nil when no location header matched, otherwise the Array of matches
+ return loc.empty? ? nil : loc
+ end
+
# This method extracts title and content (first par)
# from the given HTML or XML document _text_, using
# standard methods (String#ircify_html_title,
# uri_fragment:: the URI fragment of the original request
#
def Utils.get_string_html_info(text, opts={})
+ debug "getting string html info"
txt = text.dup
title = txt.ircify_html_title
+ debug opts
if frag = opts[:uri_fragment] and not frag.empty?
- fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
- txt.sub!(fragreg,'')
+ fragreg = /<a\s+[^>]*name=["']?#{frag}["']?[^>]*>/im
+ debug fragreg
+ debug txt
+ if txt.match(fragreg)
+ # grab the post-match
+ txt = $'
+ end
end
c_opts = opts.dup
c_opts[:strip] ||= title