X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Futils.rb;h=7b316ffe28cd3df6ffc6ecd0d1707063142dd1bb;hb=bf9734ff89a238c5a63015b68eabd8d0ef9d1308;hp=535ae190179c1bfbfa577a180d42baa24014f079;hpb=72cece8d09ae0f2835eed996e15ddfe3cb95045f;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git

diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 535ae190..7b316ffe 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -6,9 +6,6 @@
 # Author:: Tom Gilbert
 # Author:: Giuseppe "Oblomov" Bilotta
 #
-# Copyright:: (C) 2002-2006 Tom Gilbert
-# Copyright:: (C) 2007 Giuseppe Bilotta
-#
 # TODO some of these Utils should be rewritten as extensions to the approriate
 # standard Ruby classes and accordingly be moved to extends.rb

@@ -19,15 +16,6 @@ require 'set'
 begin
   require 'htmlentities'
 rescue LoadError
-  gems = nil
-  begin
-    gems = require 'rubygems'
-  rescue LoadError
-    gems = false
-  end
-  if gems
-    retry
-  else
     module ::Irc
       module Utils
         UNESCAPE_TABLE = {
@@ -35,6 +23,7 @@ rescue LoadError
     'raquo' => '»',
     'quot' => '"',
     'apos' => '\'',
+    'deg' => '°',
     'micro' => 'µ',
     'copy' => '©',
     'trade' => '™',
@@ -44,6 +33,7 @@ rescue LoadError
     'gt' => '>',
     'hellip' => '…',
     'nbsp' => ' ',
+    'ndash' => '–',
     'Agrave' => 'À',
     'Aacute' => 'Á',
     'Acirc' => 'Â',
@@ -111,7 +101,6 @@ rescue LoadError
         }
       end
     end
-  end
 end

 begin
@@ -124,15 +113,6 @@ begin
     end
   end
 rescue LoadError
-  gems = nil
-  begin
-    gems = require 'rubygems'
-  rescue LoadError
-    gems = false
-  end
-  if gems
-    retry
-  else
     module ::Irc
       module Utils
         # Some regular expressions to manage HTML data
@@ -147,13 +127,12 @@ rescue LoadError

         # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
         # to mark actual text
-        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+        AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im

         # At worst, we can try stuff which is comprised between two <br>
         AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
       end
     end
-  end
 end

 module ::Irc
@@ -172,7 +151,7 @@ module ::Irc
     def Utils.bot=(b)
       debug "initializing utils"
       @@bot = b
-      @@safe_save_dir = "#{@@bot.botclass}/safe_save"
+      @@safe_save_dir = @@bot.path('safe_save')
     end


@@ -182,10 +161,12 @@ module ::Irc
     SEC_PER_HR = SEC_PER_MIN * 60
     # Seconds per day
     SEC_PER_DAY = SEC_PER_HR * 24
+    # Seconds per week
+    SEC_PER_WK = SEC_PER_DAY * 7
     # Seconds per (30-day) month
     SEC_PER_MNTH = SEC_PER_DAY * 30
-    # Second per (30*12 = 360 day) year
-    SEC_PER_YR = SEC_PER_MNTH * 12
+    # Second per (non-leap) year
+    SEC_PER_YR = SEC_PER_DAY * 365

     # Auxiliary method needed by Utils.secs_to_string
     def Utils.secs_to_string_case(array, var, string, plural)
@@ -223,28 +204,120 @@ module ::Irc
       end
     end

+    # Turn a number of seconds into a hours:minutes:seconds e.g.
+    # 3:18:10 or 5'12" or 7s
+    #
+    def Utils.secs_to_short(seconds)
+      secs = seconds.to_i # make sure it's an integer
+      mins, secs = secs.divmod 60
+      hours, mins = mins.divmod 60
+      if hours > 0
+        return ("%s:%s:%s" % [hours, mins, secs])
+      elsif mins > 0
+        return ("%s'%s\"" % [mins, secs])
+      else
+        return ("%ss" % [secs])
+      end
+    end
+
+    # Returns human readable time.
+    # Like: 5 days ago
+    #       about one hour ago
+    # options
+    # :start_date, sets the time to measure against, defaults to now
+    # :date_format, used with to_formatted_s, default to :default
+    def Utils.timeago(time, options = {})
+      start_date = options.delete(:start_date) || Time.new
+      date_format = options.delete(:date_format) || "%x"
+      delta = (start_date - time).round
+      if delta.abs < 2
+        _("right now")
+      else
+        distance = Utils.age_string(delta)
+        if delta < 0
+          _("%{d} from now") % {:d => distance}
+        else
+          _("%{d} ago") % {:d => distance}
+        end
+      end
+    end
+
+    # Converts age in seconds to "nn units". Inspired by previous attempts
+    # but also gitweb's age_string() sub
+    def Utils.age_string(secs)
+      case
+      when secs < 0
+        Utils.age_string(-secs)
+      when secs > 2*SEC_PER_YR
+        _("%{m} years") % { :m => secs/SEC_PER_YR }
+      when secs > 2*SEC_PER_MNTH
+        _("%{m} months") % { :m => secs/SEC_PER_MNTH }
+      when secs > 2*SEC_PER_WK
+        _("%{m} weeks") % { :m => secs/SEC_PER_WK }
+      when secs > 2*SEC_PER_DAY
+        _("%{m} days") % { :m => secs/SEC_PER_DAY }
+      when secs > 2*SEC_PER_HR
+        _("%{m} hours") % { :m => secs/SEC_PER_HR }
+      when (20*SEC_PER_MIN..40*SEC_PER_MIN).include?(secs)
+        _("half an hour")
+      when (50*SEC_PER_MIN..70*SEC_PER_MIN).include?(secs)
+        # _("about one hour")
+        _("an hour")
+      when (80*SEC_PER_MIN..100*SEC_PER_MIN).include?(secs)
+        _("an hour and a half")
+      when secs > 2*SEC_PER_MIN
+        _("%{m} minutes") % { :m => secs/SEC_PER_MIN }
+      when secs > 1
+        _("%{m} seconds") % { :m => secs }
+      else
+        _("one second")
+      end
+    end

     # Execute an external program, returning a String obtained by redirecting
-    # the program's standards errors and output 
+    # the program's standards errors and output
     #
+    # TODO: find a way to expose some common errors (e.g. Errno::NOENT)
+    # to the caller
     def Utils.safe_exec(command, *args)
-      IO.popen("-") { |p|
+      output = IO.popen("-") { |p|
         if p
-          return p.readlines.join("\n")
+          break p.readlines.join("\n")
         else
           begin
             $stderr.reopen($stdout)
             exec(command, *args)
           rescue Exception => e
-            puts "exec of #{command} led to exception: #{e.pretty_inspect}"
-            Kernel::exit! 0
+            puts "exception #{e.pretty_inspect} trying to run #{command}"
+            Kernel::exit! 1
           end
           puts "exec of #{command} failed"
-          Kernel::exit! 0
+          Kernel::exit! 1
         end
       }
+      raise "safe execution of #{command} returned #{$?}" unless $?.success?
+      return output
     end

+    # Try executing an external program, returning true if the run was successful
+    # and false otherwise
+    def Utils.try_exec(command, *args)
+      IO.popen("-") { |p|
+        if p.nil?
+          begin
+            $stderr.reopen($stdout)
+            exec(command, *args)
+          rescue Exception => e
+            Kernel::exit! 1
+          end
+          Kernel::exit! 1
+        else
+          debug p.readlines
+        end
+      }
+      debug $?
+      return $?.success?
+    end

     # Safely (atomically) save to _file_, by passing a tempfile to the block
     # and then moving the tempfile to its final location when done.
@@ -265,19 +338,32 @@ module ::Irc
     # Decode HTML entities in the String _str_, using HTMLEntities if the
     # package was found, or UNESCAPE_TABLE otherwise.
     #
-    def Utils.decode_html_entities(str)
-      if defined? ::HTMLEntities
-        return HTMLEntities.decode_entities(str)
+
+    if defined? ::HTMLEntities
+      if ::HTMLEntities.respond_to? :decode_entities
+        def Utils.decode_html_entities(str)
+          return HTMLEntities.decode_entities(str)
+        end
       else
-        str.gsub(/(&(.+?);)/) {
+        @@html_entities = HTMLEntities.new
+        def Utils.decode_html_entities(str)
+          return @@html_entities.decode str
+        end
+      end
+    else
+      def Utils.decode_html_entities(str)
+        return str.gsub(/(&(.+?);)/) {
           symbol = $2
           # remove the 0-paddng from unicode integers
-          if symbol =~ /#(.+)/
-            symbol = "##{$1.to_i.to_s}"
+          case symbol
+          when /^#x([0-9a-fA-F]+)$/
+            symbol = $1.to_i(16).to_s
+          when /^#(\d+)$/
+            symbol = $1.to_i.to_s
           end
           # output the symbol's irc-translated character, or a * if it's unknown
-          UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [$0.to_i].pack("U") : '*')
+          UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [symbol.to_i].pack("U") : '*')
         }
       end
     end
@@ -410,7 +496,11 @@ module ::Irc

     # HTML first par grabber without hpricot
     def Utils.ircify_first_html_par_woh(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+      xml = xml_org.gsub(/<!--.*?-->/m,
+                         "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+                         "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+                         "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+                         "")

       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -505,9 +595,9 @@ module ::Irc
       when Net::HTTPResponse
         Utils.get_resp_html_info(doc, opts)
       when URI
-        ret = Hash.new
+        ret = DataStream.new
         @@bot.httputil.get_response(doc) { |resp|
-          ret = Utils.get_resp_html_info(resp, opts)
+          ret.replace Utils.get_resp_html_info(resp, opts)
         }
         return ret
       else
@@ -521,10 +611,13 @@ module ::Irc
     # This method extracts title, content (first par) and extra
     # information from the given Net::HTTPResponse _resp_.
     #
-    # Currently, the only accepted option (in _opts_) is
+    # Currently, the only accepted options (in _opts_) are
     # uri_fragment:: the URI fragment of the original request
+    # full_body:: get the whole body instead of
+    #             @@bot.config['http.info_bytes'] bytes only
     #
-    # Returns a Hash with the following keys:
+    # Returns a DataStream with the following keys:
+    # text:: the (partial) body
     # title:: the title of the document (if any)
     # content:: the first paragraph of the document (if any)
     # headers::
@@ -533,17 +626,21 @@ module ::Irc
     #           header fields, and whose values are Arrays.
     #
     def Utils.get_resp_html_info(resp, opts={})
-      ret = Hash.new
       case resp
       when Net::HTTPSuccess
+        loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
+        if loc and loc.fragment and not loc.fragment.empty?
+          opts[:uri_fragment] ||= loc.fragment
+        end
+        ret = DataStream.new(opts.dup)
         ret[:headers] = resp.to_hash
+        ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes'])

-        partial = resp.partial_body(@@bot.config['http.info_bytes'])
-        if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
-          loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
-          if loc and loc.fragment and not loc.fragment.empty?
-            opts[:uri_fragment] ||= loc.fragment
-          end
+        filtered = Utils.try_htmlinfo_filters(ret)
+
+        if filtered
+          return filtered
+        elsif resp['content-type'] =~ /^text\/|(?:x|ht)ml/
           ret.merge!(Utils.get_string_html_info(partial, opts))
         end
         return ret
@@ -552,6 +649,40 @@ module ::Irc
       end
     end

+    # This method runs an appropriately-crafted DataStream _ds_ through the
+    # filters in the :htmlinfo filter group, in order. If one of the filters
+    # returns non-nil, its results are merged in _ds_ and returned. Otherwise
+    # nil is returned.
+    #
+    # The input DataStream should have the downloaded HTML as primary key
+    # (:text) and possibly a :headers key holding the resonse headers.
+    #
+    def Utils.try_htmlinfo_filters(ds)
+      filters = @@bot.filter_names(:htmlinfo)
+      return nil if filters.empty?
+      cur = nil
+      # TODO filter priority
+      filters.each { |n|
+        debug "testing htmlinfo filter #{n}"
+        cur = @@bot.filter(@@bot.global_filter_name(n, :htmlinfo), ds)
+        debug "returned #{cur.pretty_inspect}"
+        break if cur
+      }
+      return ds.merge(cur) if cur
+    end
+
+    # HTML info filters often need to check if the webpage location
+    # of a passed DataStream _ds_ matches a given Regexp.
+    def Utils.check_location(ds, rx)
+      debug ds[:headers]
+      if h = ds[:headers]
+        loc = [h['x-rbot-location'],h['location']].flatten.grep(rx)
+      end
+      loc ||= []
+      debug loc
+      return loc.empty? ? nil : loc
+    end
+
     # This method extracts title and content (first par)
     # from the given HTML or XML document _text_, using
     # standard methods (String#ircify_html_title,
@@ -561,11 +692,19 @@ module ::Irc
     # uri_fragment:: the URI fragment of the original request
     #
     def Utils.get_string_html_info(text, opts={})
+      debug "getting string html info"
       txt = text.dup
       title = txt.ircify_html_title
+      debug opts
       if frag = opts[:uri_fragment] and not frag.empty?
-        fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
-        txt.sub!(fragreg,'')
+        fragreg = /<a\s+(?:[^>]+\s+)?(?:name|id)=["']?#{frag}["']?[^>]*>/im
+        debug fragreg
+        debug txt
+        if txt.match(fragreg)
+          # grab the post-match
+          txt = $'
+        end
+        debug txt
       end
       c_opts = opts.dup
       c_opts[:strip] ||= title
@@ -606,6 +745,20 @@ module ::Irc
       return retval
     end

+    # Returns a comma separated list except for the last element
+    # which is joined in with specified conjunction
+    #
+    def Utils.comma_list(words, options={})
+      defaults = { :join_with => ", ", :join_last_with => _(" and ") }
+      opts = defaults.merge(options)
+
+      if words.size < 2
+        words.last
+      else
+        [words[0..-2].join(opts[:join_with]), words.last].join(opts[:join_last_with])
+      end
+    end
+
   end
 end
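
Usage sketch (not part of the diff above): a rough idea of how the helpers introduced by this change could be called from a plugin in a running rbot. The method names and return formats follow the doc comments and code added in the hunks; the calling context (Irc::Utils loaded, the gettext _() wrapper available, example input values) is assumed.

    include Irc

    # secs_to_short formats a duration as h:m:s, m'ss" or Ns
    Utils.secs_to_short(11890)                 # => "3:18:10"
    Utils.secs_to_short(312)                   # => "5'12\""
    # age_string and timeago build gitweb-style relative ages
    Utils.age_string(3 * Utils::SEC_PER_DAY)   # => "3 days"
    Utils.timeago(Time.now - 9000)             # => "2 hours ago"
    # comma_list joins all but the last element with ", " and the last with " and "
    Utils.comma_list(%w{apples pears plums})   # => "apples, pears and plums"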