require 'tempfile'
require 'set'
+# Try to load htmlentities, fall back to an HTML escape table.
begin
require 'htmlentities'
- $we_have_html_entities_decoder = true
rescue LoadError
gems = nil
begin
if gems
retry
else
- $we_have_html_entities_decoder = false
module ::Irc
module Utils
UNESCAPE_TABLE = {
end
end
+begin
+ require 'htmlentities'
+rescue LoadError
+ gems = nil
+ begin
+ gems = require 'rubygems'
+ rescue LoadError
+ gems = false
+ end
+ if gems
+ retry
+ else
+ module ::Irc
+ module Utils
+ # Some regular expressions to manage HTML data
+
+ # Title
+ TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
+
+ # H1, H2, etc
+ HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
+ # A paragraph
+ PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+ # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
+ # to mark actual text
+ AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+ # At worst, we can try the text enclosed between two <br> tags
+ AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
+ end
+ end
+ end
+end
module ::Irc
- # miscellaneous useful functions
+ # Miscellaneous useful functions
module Utils
@@bot = nil unless defined? @@bot
@@safe_save_dir = nil unless defined?(@@safe_save_dir)
+ # The bot instance
def Utils.bot
  # Hand back whatever bot instance is currently stored in @@bot
  return @@bot
end
+ # Set up some Utils routines which depend on the associated bot.
def Utils.bot=(b)
  # Log the initialization, then remember the bot so that Utils.bot
  # (and the routines reading @@bot) can reach it.
  debug("initializing utils")
  @@bot = b
end
+ # Seconds per minute
SEC_PER_MIN = 60
+ # Seconds per hour
SEC_PER_HR = SEC_PER_MIN * 60
+ # Seconds per day
SEC_PER_DAY = SEC_PER_HR * 24
+ # Seconds per (30-day) month
SEC_PER_MNTH = SEC_PER_DAY * 30
+ # Seconds per (30*12 = 360-day) year
SEC_PER_YR = SEC_PER_MNTH * 12
+ # Auxiliary method needed by Utils.secs_to_string
def Utils.secs_to_string_case(array, var, string, plural)
case var
when 1
end
end
- # turn a number of seconds into a human readable string, e.g
- # 2 days, 3 hours, 18 minutes, 10 seconds
+ # Turn a number of seconds into a human-readable string, e.g.
+ # 2 days, 3 hours, 18 minutes and 10 seconds
def Utils.secs_to_string(secs)
  # secs:: duration in seconds; any fractional part left after the
  #        minutes are split off is truncated.
  #
  # The duration is broken down with the fixed-length SEC_PER_* units
  # (30-day months, 360-day years). Every non-zero unit is handed to
  # Utils.secs_to_string_case together with the (localized) singular and
  # plural unit names; that helper is expected to add the formatted piece
  # to +ret+.
  ret = []
  years, secs = secs.divmod SEC_PER_YR
  secs_to_string_case(ret, years, _("year"), _("years")) if years > 0
  months, secs = secs.divmod SEC_PER_MNTH
  secs_to_string_case(ret, months, _("month"), _("months")) if months > 0
  days, secs = secs.divmod SEC_PER_DAY
  secs_to_string_case(ret, days, _("day"), _("days")) if days > 0
  hours, secs = secs.divmod SEC_PER_HR
  secs_to_string_case(ret, hours, _("hour"), _("hours")) if hours > 0
  mins, secs = secs.divmod SEC_PER_MIN
  secs_to_string_case(ret, mins, _("minute"), _("minutes")) if mins > 0
  secs = secs.to_i
  # Seconds are reported even when zero if no larger unit was reported
  secs_to_string_case(ret, secs, _("second"), _("seconds")) if secs > 0 || ret.empty?
  case ret.length
  when 0
    raise "Empty ret array!"
  when 1
    # Return the single piece itself: Array#to_s yields the inspect form
    # (e.g. '["1 second"]') on Ruby 1.9 and later.
    return ret.first.to_s
  else
    # "a, b, c and d": comma-join all but the last piece
    return [ret[0, ret.length-1].join(", "), ret[-1]].join(_(" and "))
  end
end
+ # Execute an external program, returning a String obtained by redirecting
+ # the program's standard error and standard output
+ #
def Utils.safe_exec(command, *args)
- IO.popen("-") {|p|
- if(p)
+ IO.popen("-") { |p|
+ if p
return p.readlines.join("\n")
else
begin
$stderr.reopen($stdout)
exec(command, *args)
rescue Exception => e
- puts "exec of #{command} led to exception: #{e.inspect}"
+ puts "exec of #{command} led to exception: #{e.pretty_inspect}"
Kernel::exit! 0
end
puts "exec of #{command} failed"
end
+ # Safely (atomically) save to _file_, by passing a tempfile to the block
+ # and then moving the tempfile to its final location when done.
+ #
+ # call-seq: Utils.safe_save(file, &block)
+ #
def Utils.safe_save(file)
raise 'No safe save directory defined!' if @@safe_save_dir.nil?
basename = File.basename(file)
end
+ # Decode HTML entities in the String _str_, using HTMLEntities if the
+ # package was found, or UNESCAPE_TABLE otherwise.
+ #
def Utils.decode_html_entities(str)
- if $we_have_html_entities_decoder
+ if defined? ::HTMLEntities
return HTMLEntities.decode_entities(str)
else
str.gsub(/(&(.+?);)/) {
end
end
- HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
- PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
- # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
- # to mark actual text
- AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
- # At worst, we can try stuff which is comprised between two <br>
- AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
-
# Try to grab and IRCify the first HTML par (<p> tag) in the given string.
# If possible, grab the one after the first heading
#
# It is possible to pass some options to determine how the stripping
# occurs. Currently supported options are
- # * :strip => Regex or String to strip at the beginning of the obtained
- # text
- # * :min_spaces => Minimum number of spaces a paragraph should have
+ # strip:: Regex or String to strip at the beginning of the obtained
+ # text
+ # min_spaces:: minimum number of spaces a paragraph should have
#
def Utils.ircify_first_html_par(xml_org, opts={})
  # Dispatch on the availability of Hpricot: use the Hpricot-based grabber
  # when the library is loaded, the plain fallback otherwise.
  return Utils.ircify_first_html_par_wh(xml_org, opts) if defined?(::Hpricot)
  Utils.ircify_first_html_par_woh(xml_org, opts)
end
+
+ # HTML first par grabber using hpricot
+ def Utils.ircify_first_html_par_wh(xml_org, opts={})
+ doc = Hpricot(xml_org)
+ # Overview: candidate elements are tried from most to least specific; the first text with at least min_spaces spaces is returned
+ # Strip styles and scripts from the parsed document
+ (doc/"style|script").remove
+ # Dump the parsed document to the debug log
+ debug doc.inspect
+ # opts[:strip]: Regexp whose first match is removed from each candidate text; a String is escaped and anchored with ^
+ strip = opts[:strip]
+ strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
+ # opts[:min_spaces]: minimum number of spaces an acceptable text must contain (default 8, negative values become 0)
+ min_spaces = opts[:min_spaces] || 8
+ min_spaces = 0 if min_spaces < 0
+ # txt always holds the most recently examined candidate text, across attempts and loop passes
+ txt = String.new
+ # Candidate lists are built once (on first use) and reused by later passes
+ pre_h = pars = by_span = nil
+ # Each pass that finds nothing acceptable halves min_spaces and tries again
+ while true
+ debug "Minimum number of spaces: #{min_spaces}"
+
+ # Initial attempt: <p> elements that come after the first <h\d> heading seen while scanning
+ if pre_h.nil?
+ pre_h = Hpricot::Elements[]
+ found_h = false
+ doc.root.search("*") { |e|
+ case e.pathname
+ when /^h\d/
+ found_h = true
+ when 'p'
+ pre_h << e if found_h
+ end
+ }
+ debug "Hx: found: #{pre_h.pretty_inspect}"
+ end
+ # Stop at the first candidate that is non-empty and has enough spaces
+ pre_h.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Second natural attempt: just get any <p>
+ pars = doc/"p" if pars.nil?
+ debug "par: found: #{pars.pretty_inspect}"
+ pars.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Nothing yet ... let's get drastic: we look for non-par elements too,
+ # but only for those that match something that we know is likely to
+ # contain text
+
+ # Some blogging and forum platforms use spans or divs with a 'body' or
+ # 'message' or 'text' in their class to mark actual text. Since we want
+ # the class match to be partial and case insensitive, we collect
+ # the common elements that may have this class and then filter out those
+ # we don't need
+ if by_span.nil?
+ by_span = Hpricot::Elements[]
+ doc.root.each("*") { |el|
+ by_span.push el if el.pathname =~ /^(?:div|span|td|tr|tbody|table)$/ and el[:class] =~ /body|message|text/i
+ }
+ debug "other \#1: found: #{by_span.pretty_inspect}"
+ end
+ # Same acceptance test as for the <p> attempts above
+ by_span.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Last resort (not implemented yet): try the text enclosed between two <br> tags
+ # TODO
+ # End of pass: accept the last candidate (even if empty) when it has enough spaces, otherwise lower the threshold
+ debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
+ return txt unless txt.count(" ") < min_spaces
+ break if min_spaces == 0 # NOTE(review): with min_spaces == 0 the return above always fires, so this looks unreachable
+ min_spaces /= 2
+ end
+ end
+
+ # HTML first par grabber without hpricot
+ def Utils.ircify_first_html_par_woh(xml_org, opts={})
xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
strip = opts[:strip]
debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
return txt unless txt.count(" ") < min_spaces
+ break if min_spaces == 0
min_spaces /= 2
end
end
+ # This method extracts title, content (first par) and extra
+ # information from the given document _doc_.
+ #
+ # _doc_ can be a URI, a Net::HTTPResponse or a String.
+ #
+ # If _doc_ is a String, only title and content information
+ # are retrieved (if possible), using standard methods.
+ #
+ # If _doc_ is a URI or a Net::HTTPResponse, additional
+ # information is retrieved, and special title/summary
+ # extraction routines are used if possible.
+ #
def Utils.get_html_info(doc, opts={})
  # Dispatch on the kind of document:
  # String::            handled by Utils.get_string_html_info
  # Net::HTTPResponse:: handled by Utils.get_resp_html_info
  # URI::               fetched through the bot httputil service, then
  #                     handled by Utils.get_resp_html_info
  # Anything else raises a RuntimeError.
  case doc
  when String
    Utils.get_string_html_info(doc, opts)
  when Net::HTTPResponse
    Utils.get_resp_html_info(doc, opts)
  when URI
    # Work on a copy of the options: callers may reuse one opts Hash for
    # several URIs, and the fragment of one URI must not be remembered
    # (through ||=) for the following ones.
    local_opts = opts.dup
    if doc.fragment and not doc.fragment.empty?
      local_opts[:uri_fragment] ||= doc.fragment
    end
    # Stays an empty Hash unless the block below assigns a result
    ret = Hash.new
    @@bot.httputil.get_response(doc) { |resp|
      ret = Utils.get_resp_html_info(resp, local_opts)
    }
    return ret
  else
    # Still a RuntimeError (as the former bare raise), now with a message
    raise "unsupported document type #{doc.class}"
  end
end
+
# Error raised by Utils.get_resp_html_info when the HTTP response is not a
# success; the message carries the response code and message.
class ::UrlLinkError < RuntimeError; end
+
+ # This method extracts title, content (first par) and extra
+ # information from the given Net::HTTPResponse _resp_.
+ #
+ # Currently, the only accepted option (in _opts_) is
+ # uri_fragment:: the URI fragment of the original request
+ #
+ # Returns a Hash with the following keys:
+ # title:: the title of the document (if any)
+ # content:: the first paragraph of the document (if any)
+ # headers::
+ # the headers of the Net::HTTPResponse. The value is
+ # a Hash whose keys are lowercase forms of the HTTP
+ # header fields, and whose values are Arrays.
+ #
def Utils.get_resp_html_info(resp, opts={})
  # Anything but a successful response is reported as an UrlLinkError
  unless Net::HTTPSuccess === resp
    raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
  end

  info = Hash.new
  info[:headers] = resp.to_hash

  # Title and content are only extracted from textual / (X)HTML / XML
  # content, and only from the first 'http.info_bytes' bytes of the body.
  textual = resp['content-type'] =~ /^text\/|(?:x|ht)ml/
  if textual
    head = resp.partial_body(@@bot.config['http.info_bytes'])
    info.merge!(Utils.get_string_html_info(head, opts))
  end

  return info
end
+
+ # This method extracts title and content (first par)
+ # from the given HTML or XML document _text_, using
+ # standard methods (String#ircify_html_title,
+ # Utils.ircify_first_html_par)
+ #
+ # Currently, the only accepted option (in _opts_) is
+ # uri_fragment:: the URI fragment of the original request
+ #
def Utils.get_string_html_info(text, opts={})
  # Returns a Hash with the keys
  # title::   result of String#ircify_html_title on a copy of _text_
  # content:: result of Utils.ircify_first_html_par, or nil when that
  #           result is empty
  #
  # Neither _text_ nor _opts_ is modified: both are duplicated first.
  txt = text.dup
  title = txt.ircify_html_title
  frag = opts[:uri_fragment]
  if frag and not frag.empty?
    # Drop everything up to and including the <a name="frag"> anchor, so
    # that the paragraph search starts at the requested fragment. The
    # fragment comes from a URI and must be escaped: unescaped, regexp
    # metacharacters in it would either raise a RegexpError (e.g. "a(")
    # or match the wrong anchor.
    fragreg = /.*?<a\s+[^>]*name=["']?#{Regexp.escape(frag)}["']?.*?>/im
    txt.sub!(fragreg, '')
  end
  c_opts = opts.dup
  # Unless the caller asked otherwise, strip the title from the content
  c_opts[:strip] ||= title
  content = Utils.ircify_first_html_par(txt, c_opts)
  content = nil if content.empty?
  return {:title => title, :content => content}
end
+
# Get the first pars of the first _count_ _urls_.
# The pages are downloaded using the bot httputil service.
# Returns an array of the first paragraphs fetched.
url = urls.shift
idx += 1
- # FIXME what happens if some big file is returned? We should share
- # code with the url plugin to only retrieve partial file content!
- xml = self.bot.httputil.get(url)
- if xml.nil?
- debug "Unable to retrieve #{url}"
- next
- end
- par = Utils.ircify_first_html_par(xml, opts)
- if par.empty?
- debug "No first par found\n#{xml}"
- # FIXME only do this if the 'url' plugin is loaded
- # TODO even better, put the code here
- # par = @bot.plugins['url'].get_title_from_html(xml)
- if par.empty?
- retval.push(nil)
- next
+ begin
+ info = Utils.get_html_info(URI.parse(url), opts)
+
+ par = info[:content]
+ retval.push(par)
+
+ if par
+ msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+ count -=1
end
+ rescue
+ debug "Unable to retrieve #{url}: #{$!}"
+ next
end
- msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
- count -=1
- retval.push(par)
end
return retval
end
-
end
end
-Irc::Utils.bot = Irc::Plugins.manager.bot
+Irc::Utils.bot = Irc::Bot::Plugins.manager.bot