+#-- vim:sw=2:et
+#++
+#
+# :title: rbot utilities provider
+#
+# Author:: Tom Gilbert <tom@linuxbrit.co.uk>
+# Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
+#
+# Copyright:: (C) 2002-2006 Tom Gilbert
+# Copyright:: (C) 2007 Giuseppe Bilotta
+#
+# TODO some of these Utils should be rewritten as extensions to the appropriate
+# standard Ruby classes and accordingly be moved to extends.rb
+
require 'net/http'
require 'uri'
require 'tempfile'
end
- @@safe_save_dir = nil
+ @@safe_save_dir = nil unless defined?(@@safe_save_dir)
def Utils.set_safe_save_dir(str)
@@safe_save_dir = str.dup
end
}
end
end
+
+ H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im # an <h1> element (attributes allowed, case-insensitive); group 1 captures its content
+ PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im # a whole <p>...</p> element (attributes allowed, case-insensitive); no capture groups
+ # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
+ # If possible, grab the one after the first h1 heading
+ #
+ # It is possible to pass some options to determine how the stripping
+ # occurs. Currently, only one option is supported:
+ # * :strip => Regex or String to strip at the beginning of the obtained
+ # text
+ #
+ def Utils.ircify_first_html_par(xml, opts={})
+ txt = String.new
+ strip = opts[:strip]
+ strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
+
+ header_found = xml.match(H1_REGEX)
+ if header_found
+ header_found = $'
+ debug "Found header: #{header_found[1].inspect}"
+ while txt.empty?
+ candidate = header_found[PAR_REGEX]
+ break unless candidate
+ txt = candidate.ircify_html
+ header_found = $'
+ txt.sub!(strip, '') if strip
+ end
+ end
+
+ # If we haven't found a first par yet, try to get it from the whole
+ # document
+ if txt.empty?
+ header_found = xml
+ while txt.empty?
+ candidate = header_found[PAR_REGEX]
+ break unless candidate
+ txt = candidate.ircify_html
+ header_found = $'
+ txt.sub!(strip, '') if strip
+ end
+ end
+ return txt
+ end
+
+ # Get the first pars of the first _count_ _urls_.
+ # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
+ # and echoed as replies to the IRC message passed as _opts_ :message.
+ #
+ def Utils.get_first_pars(urls, count, opts={})
+ idx = 0
+ msg = opts[:message]
+ while count > 0 and urls.length > 0
+ url = urls.shift
+ idx += 1
+
+ # FIXME what happens if some big file is returned? We should share
+ # code with the url plugin to only retrieve partial file content!
+ xml = opts[:http_util].get_cached(url)
+ if xml.nil?
+ debug "Unable to retrieve #{url}"
+ next
+ end
+ par = Utils.ircify_first_html_par(xml, opts)
+ if par.empty?
+ debug "No first par found\n#{xml}"
+ # FIXME only do this if the 'url' plugin is loaded
+ # TODO even better, put the code here
+ # par = @bot.plugins['url'].get_title_from_html(xml)
+ next if par.empty?
+ end
+ msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+ count -=1
+ end
+ end
+
+
end
end