X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Futils.rb;h=ac60735d352273212a8681dc631ed05aca856fb6;hb=edd1cf77be07ae507014574141e920ad23eb164d;hp=fc89e1c3f7146077945e5794de02981b93d2b2d5;hpb=e935773b3e115d2d33e6d32f488578c650428ed2;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git

diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index fc89e1c3..ac60735d 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -1,3 +1,17 @@
+#-- vim:sw=2:et
+#++
+#
+# :title: rbot utilities provider
+#
+# Author:: Tom Gilbert
+# Author:: Giuseppe "Oblomov" Bilotta
+#
+# Copyright:: (C) 2002-2006 Tom Gilbert
+# Copyright:: (C) 2007 Giuseppe Bilotta
+#
+# TODO some of these Utils should be rewritten as extensions to the approriate
+# standard Ruby classes and accordingly be moved to extends.rb
+
 require 'net/http'
 require 'uri'
 require 'tempfile'
@@ -345,7 +359,7 @@ module ::Irc
     end
 
 
-    @@safe_save_dir = nil
+    @@safe_save_dir = nil unless defined?(@@safe_save_dir)
     def Utils.set_safe_save_dir(str)
       @@safe_save_dir = str.dup
     end
@@ -415,5 +429,81 @@ module ::Irc
         }
       end
     end
+
+    H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
+    PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
+    # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
+    # If possible, grab the one after the first h1 heading
+    #
+    # It is possible to pass some options to determine how the stripping
+    # occurs. Currently, only one option is supported:
+    #   * :strip => Regex or String to strip at the beginning of the obtained
+    #               text
+    #
+    def Utils.ircify_first_html_par(xml, opts={})
+      txt = String.new
+      strip = opts[:strip]
+      strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
+
+      header_found = xml.match(H1_REGEX)
+      if header_found
+        header_found = $'
+        debug "Found header: #{header_found[1].inspect}"
+        while txt.empty?
+          candidate = header_found[PAR_REGEX]
+          break unless candidate
+          txt = candidate.ircify_html
+          header_found = $'
+          txt.sub!(strip, '') if strip
+        end
+      end
+
+      # If we haven't found a first par yet, try to get it from the whole
+      # document
+      if txt.empty?
+        header_found = xml
+        while txt.empty?
+          candidate = header_found[PAR_REGEX]
+          break unless candidate
+          txt = candidate.ircify_html
+          header_found = $'
+          txt.sub!(strip, '') if strip
+        end
+      end
+      return txt
+    end
+
+    # Get the first pars of the first _count_ _urls_.
+    # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
+    # and echoed as replies to the IRC message passed as _opts_ :message.
+    #
+    def Utils.get_first_pars(urls, count, opts={})
+      idx = 0
+      msg = opts[:message]
+      while count > 0 and urls.length > 0
+        url = urls.shift
+        idx += 1
+
+        # FIXME what happens if some big file is returned? We should share
+        # code with the url plugin to only retrieve partial file content!
+        xml = opts[:http_util].get_cached(url)
+        if xml.nil?
+          debug "Unable to retrieve #{url}"
+          next
+        end
+        par = Utils.ircify_first_html_par(xml, opts)
+        if par.empty?
+          debug "No first par found\n#{xml}"
+          # FIXME only do this if the 'url' plugin is loaded
+          # TODO even better, put the code here
+          # par = @bot.plugins['url'].get_title_from_html(xml)
+          next if par.empty?
+        end
+        msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+        count -=1
+      end
+    end
+
+
   end
 end