# Author:: Tom Gilbert <tom@linuxbrit.co.uk>
# Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
#
-# Copyright:: (C) 2002-2006 Tom Gilbert
-# Copyright:: (C) 2007 Giuseppe Bilotta
-#
# TODO some of these Utils should be rewritten as extensions to the appropriate
# standard Ruby classes and accordingly be moved to extends.rb
begin
require 'htmlentities'
rescue LoadError
- gems = nil
- begin
- gems = require 'rubygems'
- rescue LoadError
- gems = false
- end
- if gems
- retry
- else
module ::Irc
module Utils
UNESCAPE_TABLE = {
'raquo' => '»',
'quot' => '"',
'apos' => '\'',
+ 'deg' => '°',
'micro' => 'µ',
'copy' => '©',
'trade' => '™',
'gt' => '>',
'hellip' => '…',
'nbsp' => ' ',
+ 'ndash' => '–',
'Agrave' => 'À',
'Aacute' => 'Á',
'Acirc' => 'Â',
}
end
end
- end
end
begin
end
end
rescue LoadError
- gems = nil
- begin
- gems = require 'rubygems'
- rescue LoadError
- gems = false
- end
- if gems
- retry
- else
module ::Irc
module Utils
# Some regular expressions to manage HTML data
# Some blogging and forum platforms use spans or divs with 'body', 'message', 'text' or 'post' in their class
# to mark the actual text
- AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+ AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
# At worst, we can try stuff which is comprised between two <br>
AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
end
end
- end
end
module ::Irc
def Utils.bot=(b)
debug "initializing utils"
# Keep a reference to the bot _b_ for the Utils methods that depend on it
@@bot = b
# The safe-save directory is now obtained from @@bot.path('safe_save')
# rather than being built by hand from @@bot.botclass
- @@safe_save_dir = "#{@@bot.botclass}/safe_save"
+ @@safe_save_dir = @@bot.path('safe_save')
end
SEC_PER_HR = SEC_PER_MIN * 60
# Seconds per day
SEC_PER_DAY = SEC_PER_HR * 24
+ # Seconds per week
+ SEC_PER_WK = SEC_PER_DAY * 7
# Seconds per (30-day) month
SEC_PER_MNTH = SEC_PER_DAY * 30
- # Second per (30*12 = 360 day) year
- SEC_PER_YR = SEC_PER_MNTH * 12
+ # Seconds per (non-leap) year
# NOTE: with a 365-day year, 12 * SEC_PER_MNTH (360 days) no longer equals
# SEC_PER_YR; code mixing the two units must not assume that it does
+ SEC_PER_YR = SEC_PER_DAY * 365
# Auxiliary method needed by Utils.secs_to_string
def Utils.secs_to_string_case(array, var, string, plural)
end
end
# Turn a number of seconds into a compact hours:minutes:seconds String,
# e.g. 3:18:10 or 5'12" or 7s
#
# _seconds_ is converted with to_i, so fractional seconds are dropped.
# In the h:mm:ss form minutes and seconds are zero-padded to two digits
# (3605 seconds give 1:00:05, not 1:0:5). Negative durations are formatted
# from their absolute value and prefixed with a minus sign.
#
def Utils.secs_to_short(seconds)
  total = seconds.to_i # make sure it's an integer
  # divmod on a negative Integer rounds towards -infinity, which would give
  # meaningless minute/second fields; split the magnitude and keep the sign
  sign = total < 0 ? "-" : ""
  mins, secs = total.abs.divmod(60)
  hours, mins = mins.divmod(60)
  if hours > 0
    "%s%d:%02d:%02d" % [sign, hours, mins, secs]
  elsif mins > 0
    "%s%d'%d\"" % [sign, mins, secs]
  else
    "%s%ds" % [sign, secs]
  end
end
+
# Returns a human readable description of how far _time_ is from a
# reference time, like: 5 days ago
#                       an hour ago
# Times later than the reference are described as "... from now", and
# differences of less than two seconds as "right now".
#
# options
# :start_date, sets the time to measure against, defaults to now
# :date_format, accepted for compatibility; it has no effect on the result
#
# The _options_ Hash is only read: the caller's Hash is never modified.
def Utils.timeago(time, options = {})
  start_date = options[:start_date] || Time.new
  delta = (start_date - time).round
  return _("right now") if delta.abs < 2

  distance = Utils.age_string(delta)
  if delta < 0
    # negative delta: _time_ lies after the reference time
    _("%{d} from now") % {:d => distance}
  else
    _("%{d} ago") % {:d => distance}
  end
end
+
# Converts an age expressed in seconds to an approximate "nn units" String.
# A negative age is described through its opposite. Inspired by previous
# attempts but also gitweb's age_string() sub
def Utils.age_string(secs)
  return Utils.age_string(-secs) if secs < 0

  # A unit is only used once the age exceeds twice its length; a few
  # ranges around the half hour, the hour and the hour and a half get a
  # wording of their own.
  if secs > 2*SEC_PER_YR
    _("%{m} years") % { :m => secs/SEC_PER_YR }
  elsif secs > 2*SEC_PER_MNTH
    _("%{m} months") % { :m => secs/SEC_PER_MNTH }
  elsif secs > 2*SEC_PER_WK
    _("%{m} weeks") % { :m => secs/SEC_PER_WK }
  elsif secs > 2*SEC_PER_DAY
    _("%{m} days") % { :m => secs/SEC_PER_DAY }
  elsif secs > 2*SEC_PER_HR
    _("%{m} hours") % { :m => secs/SEC_PER_HR }
  elsif (20*SEC_PER_MIN..40*SEC_PER_MIN).include?(secs)
    _("half an hour")
  elsif (50*SEC_PER_MIN..70*SEC_PER_MIN).include?(secs)
    # _("about one hour")
    _("an hour")
  elsif (80*SEC_PER_MIN..100*SEC_PER_MIN).include?(secs)
    _("an hour and a half")
  elsif secs > 2*SEC_PER_MIN
    _("%{m} minutes") % { :m => secs/SEC_PER_MIN }
  elsif secs > 1
    _("%{m} seconds") % { :m => secs }
  else
    _("one second")
  end
end
# Execute an external program, returning a String obtained by redirecting
- # the program's standards errors and output
+ # the program's standard error and output
#
+ # TODO: find a way to expose some common errors (e.g. Errno::ENOENT)
+ # to the caller
# Raises a RuntimeError when the exit status of the child is not successful.
def Utils.safe_exec(command, *args)
# IO.popen("-") forks: the block runs in both processes, with p set to the
# pipe in the parent and to nil in the child
- IO.popen("-") { |p|
+ output = IO.popen("-") { |p|
if p
# Parent: collect everything the child wrote to its standard output.
# NOTE(review): readlines keeps each line terminator, so joining with "\n"
# appears to double the line breaks -- confirm this is intended
- return p.readlines.join("\n")
+ break p.readlines.join("\n")
else
# Child: send stderr to stdout, then replace this process with command
begin
$stderr.reopen($stdout)
exec(command, *args)
rescue Exception => e
- puts "exec of #{command} led to exception: #{e.pretty_inspect}"
- Kernel::exit! 0
+ puts "exception #{e.pretty_inspect} trying to run #{command}"
+ Kernel::exit! 1
end
# Only reached if exec returned without raising; leave the child with a
# failure status so the parent can tell
puts "exec of #{command} failed"
- Kernel::exit! 0
+ Kernel::exit! 1
end
}
+ raise "safe execution of #{command} returned #{$?}" unless $?.success?
+ return output
end
# Try executing an external program, returning true if the run was successful
# and false otherwise. The output of the program (standard error included)
# is only sent to the debug log.
def Utils.try_exec(command, *args)
  IO.popen("-") do |pipe|
    if pipe
      # parent side of the fork: drain and log what the child printed
      debug pipe.readlines
    else
      # child side of the fork: become the requested program, or exit
      # with a failure status when that is not possible
      begin
        $stderr.reopen($stdout)
        exec(command, *args)
      rescue Exception
        Kernel::exit! 1
      end
      Kernel::exit! 1
    end
  end
  debug $?
  $?.success?
end
# Safely (atomically) save to _file_, by passing a tempfile to the block
# and then moving the tempfile to its final location when done.
# Decode HTML entities in the String _str_, using HTMLEntities if the
# package was found, or UNESCAPE_TABLE otherwise.
#
- def Utils.decode_html_entities(str)
- if defined? ::HTMLEntities
- return HTMLEntities.decode_entities(str)
+
+ if defined? ::HTMLEntities
+ if ::HTMLEntities.respond_to? :decode_entities
+ def Utils.decode_html_entities(str)
+ return HTMLEntities.decode_entities(str)
+ end
else
- str.gsub(/(&(.+?);)/) {
+ @@html_entities = HTMLEntities.new
+ def Utils.decode_html_entities(str)
+ return @@html_entities.decode str
+ end
+ end
+ else
+ def Utils.decode_html_entities(str)
+ return str.gsub(/(&(.+?);)/) {
symbol = $2
# remove the 0-paddng from unicode integers
- if symbol =~ /^#(\d+)$/
+ case symbol
+ when /^#x([0-9a-fA-F]+)$/
+ symbol = $1.to_i(16).to_s
+ when /^#(\d+)$/
symbol = $1.to_i.to_s
end
# HTML first par grabber without hpricot
def Utils.ircify_first_html_par_woh(xml_org, opts={})
- xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+ xml = xml_org.gsub(/<!--.*?-->/m,
+ "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+ "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+ "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+ "")
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
when Net::HTTPResponse
Utils.get_resp_html_info(doc, opts)
when URI
- ret = Hash.new
+ ret = DataStream.new
@@bot.httputil.get_response(doc) { |resp|
- ret = Utils.get_resp_html_info(resp, opts)
+ ret.replace Utils.get_resp_html_info(resp, opts)
}
return ret
else
# This method extracts title, content (first par) and extra
# information from the given Net::HTTPResponse _resp_.
#
- # Currently, the only accepted option (in _opts_) is
+ # Currently, the only accepted options (in _opts_) are
# uri_fragment:: the URI fragment of the original request
+ # full_body:: get the whole body instead of
+ # @@bot.config['http.info_bytes'] bytes only
#
- # Returns a Hash with the following keys:
+ # Returns a DataStream with the following keys:
+ # text:: the (partial) body
# title:: the title of the document (if any)
# content:: the first paragraph of the document (if any)
# headers::
# header fields, and whose values are Arrays.
#
def Utils.get_resp_html_info(resp, opts={})
- ret = Hash.new
case resp
when Net::HTTPSuccess
# Take the URI fragment from the response's location headers unless the
# caller already supplied :uri_fragment; an unparsable location is ignored.
# NOTE(review): this stores the fragment in the caller's opts Hash
+ loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
+ if loc and loc.fragment and not loc.fragment.empty?
+ opts[:uri_fragment] ||= loc.fragment
+ end
# The result starts from a copy of the options, then gets headers and body
+ ret = DataStream.new(opts.dup)
ret[:headers] = resp.to_hash
# :full_body selects the whole body over the first http.info_bytes bytes
+ ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes'])
- partial = resp.partial_body(@@bot.config['http.info_bytes'])
- if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
- loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
- if loc and loc.fragment and not loc.fragment.empty?
- opts[:uri_fragment] ||= loc.fragment
- end
# The :htmlinfo filters get the first chance at describing the document
+ filtered = Utils.try_htmlinfo_filters(ret)
+
+ if filtered
+ return filtered
+ elsif resp['content-type'] =~ /^text\/|(?:x|ht)ml/
# No filter produced a result: use the generic title/first paragraph
# extraction for text, XML and HTML content types
ret.merge!(Utils.get_string_html_info(partial, opts))
end
return ret
end
end
# This method runs an appropriately-crafted DataStream _ds_ through the
# filters in the :htmlinfo filter group, in order. The first filter that
# returns non-nil stops the search: its result is merged with _ds_ and the
# merge is returned. When no filter is registered, or none of them returns
# a result, nil is returned.
#
# The input DataStream should have the downloaded HTML as primary key
# (:text) and possibly a :headers key holding the response headers.
#
def Utils.try_htmlinfo_filters(ds)
  names = @@bot.filter_names(:htmlinfo)
  return nil if names.empty?
  # TODO filter priority
  names.each do |name|
    debug "testing htmlinfo filter #{name}"
    outcome = @@bot.filter(@@bot.global_filter_name(name, :htmlinfo), ds)
    debug "returned #{outcome.pretty_inspect}"
    return ds.merge(outcome) if outcome
  end
  nil
end
+
# HTML info filters often need to check if the webpage location
# of a passed DataStream _ds_ matches a given Regexp _rx_.
#
# The 'x-rbot-location' and 'location' entries of ds[:headers] are tested;
# the matching values are returned as an Array, or nil when nothing matches
# or _ds_ has no headers.
def Utils.check_location(ds, rx)
  debug ds[:headers]
  headers = ds[:headers]
  matches =
    if headers
      [headers['x-rbot-location'], headers['location']].flatten.grep(rx)
    else
      []
    end
  debug matches
  matches.empty? ? nil : matches
end
+
# This method extracts title and content (first par)
# from the given HTML or XML document _text_, using
# standard methods (String#ircify_html_title,
# uri_fragment:: the URI fragment of the original request
#
def Utils.get_string_html_info(text, opts={})
+ debug "getting string html info"
txt = text.dup
title = txt.ircify_html_title
+ debug opts
if frag = opts[:uri_fragment] and not frag.empty?
- fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
- txt.sub!(fragreg,'')
+ fragreg = /<a\s+(?:[^>]+\s+)?(?:name|id)=["']?#{frag}["']?[^>]*>/im
+ debug fragreg
+ debug txt
+ if txt.match(fragreg)
+ # grab the post-match
+ txt = $'
+ end
+ debug txt
end
c_opts = opts.dup
c_opts[:strip] ||= title
return retval
end
# Joins _words_ into a single list: all the elements but the last are
# joined with options[:join_with] (", " by default), and the last one is
# then attached with options[:join_last_with] (the translation of " and "
# by default).
#
# With fewer than two elements no joining takes place and words.last is
# returned as it is (nil for an empty list).
#
def Utils.comma_list(words, options={})
  separators = {
    :join_with => ", ",
    :join_last_with => _(" and ")
  }.merge(options)

  return words.last if words.size < 2

  leading = words[0..-2].join(separators[:join_with])
  [leading, words.last].join(separators[:join_last_with])
end
+
end
end