X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Futils.rb;h=ac60735d352273212a8681dc631ed05aca856fb6;hb=edd1cf77be07ae507014574141e920ad23eb164d;hp=fc89e1c3f7146077945e5794de02981b93d2b2d5;hpb=e935773b3e115d2d33e6d32f488578c650428ed2;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git

diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index fc89e1c3..ac60735d 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -1,3 +1,17 @@
+#-- vim:sw=2:et
+#++
+#
+# :title: rbot utilities provider
+#
+# Author:: Tom Gilbert <tom@linuxbrit.co.uk>
+# Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
+#
+# Copyright:: (C) 2002-2006 Tom Gilbert
+# Copyright:: (C) 2007 Giuseppe Bilotta
+#
+# TODO some of these Utils should be rewritten as extensions to the appropriate
+# standard Ruby classes and accordingly be moved to extends.rb
+
 require 'net/http'
 require 'uri'
 require 'tempfile'
@@ -345,7 +359,7 @@ module ::Irc
     end
 
 
-    @@safe_save_dir = nil
+    @@safe_save_dir = nil unless defined?(@@safe_save_dir) # nil until set_safe_save_dir stores one; an already set value is kept
     def Utils.set_safe_save_dir(str)
       @@safe_save_dir = str.dup
     end
@@ -415,5 +429,81 @@ module ::Irc
         }
       end
     end
+
+    # Matches an h1 heading (<h1> tag); group 1 captures its content.
+    H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
+    # Matches a whole HTML par, opening and closing <p> tags included.
+    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
+
+    # Try to grab and IRCify the first non-empty HTML par (<p> tag) in the
+    # given string. The pars after the first h1 heading are tried first; if
+    # none of them gives any text, the whole document is scanned instead.
+    #
+    # It is possible to pass some options to determine how the stripping
+    # occurs. Currently, only one option is supported:
+    #   * :strip => Regex or String to strip at the beginning of the obtained
+    #               text
+    #
+    # Returns the IRCified text, or an empty String if no par gave any text.
+    #
+    def Utils.ircify_first_html_par(xml, opts={})
+      strip = opts[:strip]
+      strip = /^#{Regexp.escape(strip)}/ if strip.kind_of?(String)
+
+      # Texts to scan for pars, in order of preference
+      sources = []
+      header = xml.match(H1_REGEX)
+      if header
+        # Log the captured heading while we still hold the MatchData
+        debug "Found header: #{header[1].inspect}"
+        sources << header.post_match
+      end
+      sources << xml
+
+      sources.each do |text|
+        while (par = text.match(PAR_REGEX))
+          txt = par[0].ircify_html
+          txt.sub!(strip, '') if strip
+          return txt unless txt.empty?
+          # Nothing left of this par once IRCified: move on to the next one
+          text = par.post_match
+        end
+      end
+
+      return String.new
+    end
+
+    # Echo the first par of each of the first _count_ _urls_ that has one.
+    # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
+    # and echoed as replies to the IRC message passed as _opts_ :message, if any.
+    # URLs are shifted off _urls_ as they get tried.
+    def Utils.get_first_pars(urls, count, opts={})
+      reply_to = opts[:message]
+      tried = 0
+      until count <= 0 or urls.empty?
+        url = urls.shift
+        tried += 1
+        # FIXME what happens if some big file is returned? We should share
+        # code with the url plugin to only retrieve partial file content!
+        page = opts[:http_util].get_cached(url)
+        if page.nil?
+          debug "Unable to retrieve #{url}"
+          next
+        end
+        par = Utils.ircify_first_html_par(page, opts)
+        if par.empty?
+          debug "No first par found\n#{page}"
+          # FIXME only do this if the 'url' plugin is loaded
+          # TODO even better, put the code here
+          # par = @bot.plugins['url'].get_title_from_html(page)
+        end
+        unless par.empty?
+          reply_to.reply "[#{tried}] #{par}", :overlong => :truncate if reply_to
+          count -= 1
+        end
+      end
+    end
+
+
   end
 end