first_html_par: build lists 'manually' when using Hpricot

[user/henk/code/ruby/rbot.git] / lib / rbot / core / utils / utils.rb
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index 8c23b2cfee0f7a908cde4b4b7ff0601a3d676071..9b678defaa0400ef68be01a8ae6fa122850db873 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -498,25 +498,26 @@ module ::Irc
  
        txt = String.new
  
-      h = %w{h1 h2 h3 h4 h5 h6}
-      p = %w{p}
-      ar = []
-      h.each { |hx|
-        p.each { |px|
-          ar << "#{hx}~#{px}"
-        }
-      }
-      h_p_css = ar.join("|")
-      debug "css search: #{h_p_css}"
-
        pre_h = pars = by_span = nil
  
        while true
          debug "Minimum number of spaces: #{min_spaces}"
  
          # Initial attempt: <p> that follows <h\d>
-        pre_h = doc/h_p_css if pre_h.nil?
-        debug "Hx: found: #{pre_h.pretty_inspect}"
+        if pre_h.nil?
+          pre_h = Hpricot::Elements[]
+          found_h = false
+          doc.root.search("*") { |e|
+            case e.pathname
+            when /^h\d/
+              found_h = true
+            when 'p'
+              pre_h << e if found_h
+            end
+          }
+          debug "Hx: found: #{pre_h.pretty_inspect}"
+        end
+
          pre_h.each { |p|
            debug p
            txt = p.to_html.ircify_html
@@ -551,9 +552,8 @@ module ::Irc
          # we don't need
          if by_span.nil?
            by_span = Hpricot::Elements[]
-          pre_pars = doc/"div|span|td|tr|tbody|table"
-          pre_pars.each { |el|
-            by_span.push el if el.class =~ /body|message|text/i
+          doc.root.each("*") { |el|
+            by_span.push el if el.pathname =~ /^(?:div|span|td|tr|tbody|table)$/ and el[:class] =~ /body|message|text/i
            }
            debug "other \#1: found: #{by_span.pretty_inspect}"
          end
@@ -736,7 +736,9 @@ module ::Irc
          fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
          txt.sub!(fragreg,'')
        end
-      content = Utils.ircify_first_html_par(txt, :strip => title)
+      c_opts = opts.dup
+      c_opts[:strip] ||= title
+      content = Utils.ircify_first_html_par(txt, c_opts)
        content = nil if content.empty?
        return {:title => title, :content => content}
      end
@@ -755,27 +757,20 @@ module ::Irc
          url = urls.shift
          idx += 1
  
-        # FIXME what happens if some big file is returned? We should share
-        # code with the url plugin to only retrieve partial file content!
-        xml = self.bot.httputil.get(url)
-        if xml.nil?
-          debug "Unable to retrieve #{url}"
-          next
-        end
-        par = Utils.ircify_first_html_par(xml, opts)
-        if par.empty?
-          debug "No first par found\n#{xml}"
-          # FIXME only do this if the 'url' plugin is loaded
-          # TODO even better, put the code here
-          # par = @bot.plugins['url'].get_title_from_html(xml)
-          if par.empty?
-            retval.push(nil)
-            next
+        begin
+          info = Utils.get_html_info(URI.parse(url), opts)
+
+          par = info[:content]
+          retval.push(par)
+
+          if par
+            msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+            count -=1
            end
+        rescue
+          debug "Unable to retrieve #{url}: #{$!}"
+          next
          end
-        msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
-        count -=1
-        retval.push(par)
        end
        return retval
      end