X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=lib%2Frbot%2Fcore%2Futils%2Futils.rb;h=7b316ffe28cd3df6ffc6ecd0d1707063142dd1bb;hb=bf9734ff89a238c5a63015b68eabd8d0ef9d1308;hp=65ba66514c2b9ba655fc0fe112aaebcad84cee72;hpb=06ab3351b8b6bdc8628244959b03751acd61e2fa;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 65ba6651..7b316ffe 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -6,9 +6,6 @@ # Author:: Tom Gilbert # Author:: Giuseppe "Oblomov" Bilotta # -# Copyright:: (C) 2002-2006 Tom Gilbert -# Copyright:: (C) 2007 Giuseppe Bilotta -# # TODO some of these Utils should be rewritten as extensions to the approriate # standard Ruby classes and accordingly be moved to extends.rb @@ -19,305 +16,109 @@ require 'set' begin require 'htmlentities' rescue LoadError - gems = nil - begin - gems = require 'rubygems' - rescue LoadError - gems = false - end - if gems - retry - else module ::Irc module Utils UNESCAPE_TABLE = { - 'laquo' => '<<', - 'raquo' => '>>', + 'laquo' => '«', + 'raquo' => '»', 'quot' => '"', 'apos' => '\'', - 'micro' => 'u', - 'copy' => '(c)', - 'trade' => '(tm)', - 'reg' => '(R)', - '#174' => '(R)', - '#8220' => '"', - '#8221' => '"', - '#8212' => '--', - '#39' => '\'', + 'deg' => '°', + 'micro' => 'µ', + 'copy' => '©', + 'trade' => '™', + 'reg' => '®', 'amp' => '&', 'lt' => '<', 'gt' => '>', - 'hellip' => '...', - 'nbsp' => ' ', -=begin - # extras codes, for future use... - 'zwnj' => '‌', - 'aring' => '\xe5', - 'gt' => '>', - 'yen' => '\xa5', - 'ograve' => '\xf2', - 'Chi' => 'Χ', - 'bull' => '•', - 'Egrave' => '\xc8', - 'Ntilde' => '\xd1', - 'upsih' => 'ϒ', - 'Yacute' => '\xdd', - 'asymp' => '≈', - 'radic' => '√', - 'otimes' => '⊗', - 'nabla' => '∇', - 'aelig' => '\xe6', - 'oelig' => 'œ', - 'equiv' => '≡', - 'Psi' => 'Ψ', - 'auml' => '\xe4', - 'circ' => 'ˆ', - 'Acirc' => '\xc2', - 'Epsilon' => 'Ε', - 'Yuml' => 'Ÿ', - 'Eta' => 'Η', - 'Icirc' => '\xce', - 'Upsilon' => 'Υ', - 'ndash' => '–', - 'there4' => '∴', - 'Prime' => '″', - 'prime' => '′', - 'psi' => 'ψ', - 'Kappa' => 'Κ', - 'rsaquo' => '›', - 'Tau' => 'Τ', - 'darr' => '↓', - 'ocirc' => '\xf4', - 'lrm' => '‎', - 'zwj' => '‍', - 'cedil' => '\xb8', - 'Ecirc' => '\xca', - 'not' => '\xac', - 'AElig' => '\xc6', - 'oslash' => '\xf8', - 'acute' => '\xb4', - 'lceil' => '⌈', - 'shy' => '\xad', - 'rdquo' => '”', - 'ge' => '≥', - 'Igrave' => '\xcc', - 'Ograve' => '\xd2', - 'euro' => '€', - 'dArr' => '⇓', - 'sdot' => '⋅', - 'nbsp' => '\xa0', - 'lfloor' => '⌊', - 'lArr' => '⇐', - 'Auml' => '\xc4', - 'larr' => '←', - 'Atilde' => '\xc3', - 'Otilde' => '\xd5', - 'szlig' => '\xdf', - 'clubs' => '♣', - 'diams' => '♦', - 'agrave' => '\xe0', - 'Ocirc' => '\xd4', - 'Iota' => 'Ι', - 'Theta' => 'Θ', - 'Pi' => 'Π', - 'OElig' => 'Œ', - 'Scaron' => 'Š', - 'frac14' => '\xbc', - 'egrave' => '\xe8', - 'sub' => '⊂', - 'iexcl' => '\xa1', - 'frac12' => '\xbd', - 'sbquo' => '‚', - 'ordf' => '\xaa', - 'sum' => '∑', - 'prop' => '∝', - 'Uuml' => '\xdc', - 'ntilde' => '\xf1', - 'sup' => '⊃', - 'theta' => 'θ', - 'prod' => '∏', - 'nsub' => '⊄', - 'hArr' => '⇔', - 'rlm' => '‏', - 'THORN' => '\xde', - 'infin' => '∞', - 'yuml' => '\xff', - 'Mu' => 'Μ', - 'le' => '≤', - 'Eacute' => '\xc9', - 'thinsp' => ' ', - 'ecirc' => '\xea', - 'bdquo' => '„', - 'Sigma' => 'Σ', - 'fnof' => 'ƒ', - 'Aring' => '\xc5', - 'tilde' => '˜', - 'frac34' => '\xbe', - 'emsp' => ' ', - 'mdash' => '—', - 'uarr' => '↑', - 'permil' => '‰', - 'Ugrave' => '\xd9', - 'rarr' => '→', - 'Agrave' => '\xc0', - 'chi' => 'χ', - 'forall' => '∀', - 'eth' => '\xf0', - 'rceil' => '⌉', - 'iuml' => '\xef', - 'gamma' => 'γ', - 'lambda' => 'λ', - 'harr' => '↔', - 'rang' => '〉', - 'xi' => 'ξ', - 'dagger' => '†', - 'divide' => '\xf7', - 'Ouml' => '\xd6', - 'image' => 'ℑ', - 'alefsym' => 'ℵ', - 'igrave' => '\xec', - 'otilde' => '\xf5', - 'Oacute' => '\xd3', - 'sube' => '⊆', - 'alpha' => 'α', - 'frasl' => '⁄', - 'ETH' => '\xd0', - 'lowast' => '∗', - 'Nu' => 'Ν', - 'plusmn' => '\xb1', - 'Euml' => '\xcb', - 'real' => 'ℜ', - 'sup1' => '\xb9', - 'sup2' => '\xb2', - 'sup3' => '\xb3', - 'Oslash' => '\xd8', - 'Aacute' => '\xc1', - 'cent' => '\xa2', - 'oline' => '‾', - 'Beta' => 'Β', - 'perp' => '⊥', - 'Delta' => 'Δ', - 'loz' => '◊', - 'pi' => 'π', - 'iota' => 'ι', - 'empty' => '∅', - 'euml' => '\xeb', - 'brvbar' => '\xa6', - 'iacute' => '\xed', - 'para' => '\xb6', - 'micro' => '\xb5', - 'cup' => '∪', - 'weierp' => '℘', - 'uuml' => '\xfc', - 'part' => '∂', - 'icirc' => '\xee', - 'delta' => 'δ', - 'omicron' => 'ο', - 'upsilon' => 'υ', - 'Iuml' => '\xcf', - 'Lambda' => 'Λ', - 'Xi' => 'Ξ', - 'kappa' => 'κ', - 'ccedil' => '\xe7', - 'Ucirc' => '\xdb', - 'cap' => '∩', - 'mu' => 'μ', - 'scaron' => 'š', - 'lsquo' => '‘', - 'isin' => '∈', - 'Zeta' => 'Ζ', - 'supe' => '⊇', - 'deg' => '\xb0', - 'and' => '∧', - 'tau' => 'τ', - 'pound' => '\xa3', - 'hellip' => '…', - 'curren' => '\xa4', - 'int' => '∫', - 'ucirc' => '\xfb', - 'rfloor' => '⌋', - 'ensp' => ' ', - 'crarr' => '↵', - 'ugrave' => '\xf9', - 'notin' => '∉', - 'exist' => '∃', - 'uArr' => '⇑', - 'cong' => '≅', - 'Dagger' => '‡', - 'oplus' => '⊕', - 'times' => '\xd7', - 'atilde' => '\xe3', - 'piv' => 'ϖ', - 'ni' => '∋', - 'Phi' => 'Φ', - 'lsaquo' => '‹', - 'Uacute' => '\xda', - 'Omicron' => 'Ο', - 'ang' => '∠', - 'ne' => '≠', - 'iquest' => '\xbf', - 'eta' => 'η', - 'yacute' => '\xfd', - 'Rho' => 'Ρ', - 'uacute' => '\xfa', - 'Alpha' => 'Α', - 'zeta' => 'ζ', - 'Omega' => 'Ω', - 'nu' => 'ν', - 'sim' => '∼', - 'sect' => '\xa7', - 'phi' => 'φ', - 'sigmaf' => 'ς', - 'macr' => '\xaf', - 'minus' => '−', - 'Ccedil' => '\xc7', - 'ordm' => '\xba', - 'epsilon' => 'ε', - 'beta' => 'β', - 'rArr' => '⇒', - 'rho' => 'ρ', - 'aacute' => '\xe1', - 'eacute' => '\xe9', - 'omega' => 'ω', - 'middot' => '\xb7', - 'Gamma' => 'Γ', - 'Iacute' => '\xcd', - 'lang' => '〈', - 'spades' => '♠', - 'rsquo' => '’', - 'uml' => '\xa8', - 'thorn' => '\xfe', - 'ouml' => '\xf6', - 'thetasym' => 'ϑ', - 'or' => '∨', - 'raquo' => '\xbb', - 'acirc' => '\xe2', - 'ldquo' => '“', - 'hearts' => '♥', - 'sigma' => 'σ', - 'oacute' => '\xf3', -=end + 'hellip' => '…', + 'nbsp' => ' ', + 'ndash' => '–', + 'Agrave' => 'À', + 'Aacute' => 'Á', + 'Acirc' => 'Â', + 'Atilde' => 'Ã', + 'Auml' => 'Ä', + 'Aring' => 'Å', + 'AElig' => 'Æ', + 'OElig' => 'Œ', + 'Ccedil' => 'Ç', + 'Egrave' => 'È', + 'Eacute' => 'É', + 'Ecirc' => 'Ê', + 'Euml' => 'Ë', + 'Igrave' => 'Ì', + 'Iacute' => 'Í', + 'Icirc' => 'Î', + 'Iuml' => 'Ï', + 'ETH' => 'Ð', + 'Ntilde' => 'Ñ', + 'Ograve' => 'Ò', + 'Oacute' => 'Ó', + 'Ocirc' => 'Ô', + 'Otilde' => 'Õ', + 'Ouml' => 'Ö', + 'Oslash' => 'Ø', + 'Ugrave' => 'Ù', + 'Uacute' => 'Ú', + 'Ucirc' => 'Û', + 'Uuml' => 'Ü', + 'Yacute' => 'Ý', + 'THORN' => 'Þ', + 'szlig' => 'ß', + 'agrave' => 'à', + 'aacute' => 'á', + 'acirc' => 'â', + 'atilde' => 'ã', + 'auml' => 'ä', + 'aring' => 'Ã¥', + 'aelig' => 'æ', + 'oelig' => 'œ', + 'ccedil' => 'ç', + 'egrave' => 'è', + 'eacute' => 'é', + 'ecirc' => 'ê', + 'euml' => 'ë', + 'igrave' => 'ì', + 'iacute' => 'í', + 'icirc' => 'î', + 'iuml' => 'ï', + 'eth' => 'ð', + 'ntilde' => 'ñ', + 'ograve' => 'ò', + 'oacute' => 'ó', + 'ocirc' => 'ô', + 'otilde' => 'õ', + 'ouml' => 'ö', + 'oslash' => 'ø', + 'ugrave' => 'ù', + 'uacute' => 'ú', + 'ucirc' => 'û', + 'uuml' => 'ü', + 'yacute' => 'ý', + 'thorn' => 'þ', + 'yuml' => 'ÿ' } end end - end end begin - require 'htmlentities' -rescue LoadError - gems = nil - begin - gems = require 'rubygems' - rescue LoadError - gems = false + require 'hpricot' + module ::Irc + module Utils + AFTER_PAR_PATH = /^(?:div|span)$/ + AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/ + AFTER_PAR_CLASS = /body|message|text/i + end end - if gems - retry - else +rescue LoadError module ::Irc module Utils - # define some regular expressions to be used for first_html_par + # Some regular expressions to manage HTML data + + # Title + TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im # H1, H2, etc HX_REGEX = /]*)?>(.*?)<\/h\1>/im @@ -326,39 +127,48 @@ rescue LoadError # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class # to mark actual text - AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im + AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im # At worst, we can try stuff which is comprised between two
AFTER_PAR2_REGEX = /]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im end end - end end module ::Irc - # miscellaneous useful functions + # Miscellaneous useful functions module Utils @@bot = nil unless defined? @@bot @@safe_save_dir = nil unless defined?(@@safe_save_dir) + # The bot instance def Utils.bot @@bot end + # Set up some Utils routines which depend on the associated bot. def Utils.bot=(b) debug "initializing utils" @@bot = b - @@safe_save_dir = "#{@@bot.botclass}/safe_save" + @@safe_save_dir = @@bot.path('safe_save') end + # Seconds per minute SEC_PER_MIN = 60 + # Seconds per hour SEC_PER_HR = SEC_PER_MIN * 60 + # Seconds per day SEC_PER_DAY = SEC_PER_HR * 24 + # Seconds per week + SEC_PER_WK = SEC_PER_DAY * 7 + # Seconds per (30-day) month SEC_PER_MNTH = SEC_PER_DAY * 30 - SEC_PER_YR = SEC_PER_MNTH * 12 + # Second per (non-leap) year + SEC_PER_YR = SEC_PER_DAY * 365 + # Auxiliary method needed by Utils.secs_to_string def Utils.secs_to_string_case(array, var, string, plural) case var when 1 @@ -368,8 +178,8 @@ module ::Irc end end - # turn a number of seconds into a human readable string, e.g - # 2 days, 3 hours, 18 minutes, 10 seconds + # Turn a number of seconds into a human readable string, e.g + # 2 days, 3 hours, 18 minutes and 10 seconds def Utils.secs_to_string(secs) ret = [] years, secs = secs.divmod SEC_PER_YR @@ -394,26 +204,126 @@ module ::Irc end end + # Turn a number of seconds into a hours:minutes:seconds e.g. + # 3:18:10 or 5'12" or 7s + # + def Utils.secs_to_short(seconds) + secs = seconds.to_i # make sure it's an integer + mins, secs = secs.divmod 60 + hours, mins = mins.divmod 60 + if hours > 0 + return ("%s:%s:%s" % [hours, mins, secs]) + elsif mins > 0 + return ("%s'%s\"" % [mins, secs]) + else + return ("%ss" % [secs]) + end + end + + # Returns human readable time. + # Like: 5 days ago + # about one hour ago + # options + # :start_date, sets the time to measure against, defaults to now + # :date_format, used with to_formatted_s, default to :default + def Utils.timeago(time, options = {}) + start_date = options.delete(:start_date) || Time.new + date_format = options.delete(:date_format) || "%x" + delta = (start_date - time).round + if delta.abs < 2 + _("right now") + else + distance = Utils.age_string(delta) + if delta < 0 + _("%{d} from now") % {:d => distance} + else + _("%{d} ago") % {:d => distance} + end + end + end + + # Converts age in seconds to "nn units". Inspired by previous attempts + # but also gitweb's age_string() sub + def Utils.age_string(secs) + case + when secs < 0 + Utils.age_string(-secs) + when secs > 2*SEC_PER_YR + _("%{m} years") % { :m => secs/SEC_PER_YR } + when secs > 2*SEC_PER_MNTH + _("%{m} months") % { :m => secs/SEC_PER_MNTH } + when secs > 2*SEC_PER_WK + _("%{m} weeks") % { :m => secs/SEC_PER_WK } + when secs > 2*SEC_PER_DAY + _("%{m} days") % { :m => secs/SEC_PER_DAY } + when secs > 2*SEC_PER_HR + _("%{m} hours") % { :m => secs/SEC_PER_HR } + when (20*SEC_PER_MIN..40*SEC_PER_MIN).include?(secs) + _("half an hour") + when (50*SEC_PER_MIN..70*SEC_PER_MIN).include?(secs) + # _("about one hour") + _("an hour") + when (80*SEC_PER_MIN..100*SEC_PER_MIN).include?(secs) + _("an hour and a half") + when secs > 2*SEC_PER_MIN + _("%{m} minutes") % { :m => secs/SEC_PER_MIN } + when secs > 1 + _("%{m} seconds") % { :m => secs } + else + _("one second") + end + end + # Execute an external program, returning a String obtained by redirecting + # the program's standards errors and output + # + # TODO: find a way to expose some common errors (e.g. Errno::NOENT) + # to the caller def Utils.safe_exec(command, *args) - IO.popen("-") {|p| - if(p) - return p.readlines.join("\n") + output = IO.popen("-") { |p| + if p + break p.readlines.join("\n") else begin $stderr.reopen($stdout) exec(command, *args) rescue Exception => e - puts "exec of #{command} led to exception: #{e.pretty_inspect}" - Kernel::exit! 0 + puts "exception #{e.pretty_inspect} trying to run #{command}" + Kernel::exit! 1 end puts "exec of #{command} failed" - Kernel::exit! 0 + Kernel::exit! 1 end } + raise "safe execution of #{command} returned #{$?}" unless $?.success? + return output end + # Try executing an external program, returning true if the run was successful + # and false otherwise + def Utils.try_exec(command, *args) + IO.popen("-") { |p| + if p.nil? + begin + $stderr.reopen($stdout) + exec(command, *args) + rescue Exception => e + Kernel::exit! 1 + end + Kernel::exit! 1 + else + debug p.readlines + end + } + debug $? + return $?.success? + end + # Safely (atomically) save to _file_, by passing a tempfile to the block + # and then moving the tempfile to its final location when done. + # + # call-seq: Utils.safe_save(file, &block) + # def Utils.safe_save(file) raise 'No safe save directory defined!' if @@safe_save_dir.nil? basename = File.basename(file) @@ -425,19 +335,35 @@ module ::Irc end - def Utils.decode_html_entities(str) - if defined? ::HTMLEntities - return HTMLEntities.decode_entities(str) + # Decode HTML entities in the String _str_, using HTMLEntities if the + # package was found, or UNESCAPE_TABLE otherwise. + # + + if defined? ::HTMLEntities + if ::HTMLEntities.respond_to? :decode_entities + def Utils.decode_html_entities(str) + return HTMLEntities.decode_entities(str) + end else - str.gsub(/(&(.+?);)/) { + @@html_entities = HTMLEntities.new + def Utils.decode_html_entities(str) + return @@html_entities.decode str + end + end + else + def Utils.decode_html_entities(str) + return str.gsub(/(&(.+?);)/) { symbol = $2 # remove the 0-paddng from unicode integers - if symbol =~ /#(.+)/ - symbol = "##{$1.to_i.to_s}" + case symbol + when /^#x([0-9a-fA-F]+)$/ + symbol = $1.to_i(16).to_s + when /^#(\d+)$/ + symbol = $1.to_i.to_s end # output the symbol's irc-translated character, or a * if it's unknown - UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*' + UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [symbol.to_i].pack("U") : '*') } end end @@ -447,9 +373,9 @@ module ::Irc # # It is possible to pass some options to determine how the stripping # occurs. Currently supported options are - # * :strip => Regex or String to strip at the beginning of the obtained - # text - # * :min_spaces => Minimum number of spaces a paragraph should have + # strip:: Regex or String to strip at the beginning of the obtained + # text + # min_spaces:: minimum number of spaces a paragraph should have # def Utils.ircify_first_html_par(xml_org, opts={}) if defined? ::Hpricot @@ -459,14 +385,14 @@ module ::Irc end end - # with hpricot + # HTML first par grabber using hpricot def Utils.ircify_first_html_par_wh(xml_org, opts={}) doc = Hpricot(xml_org) # Strip styles and scripts (doc/"style|script").remove - debug doc.inspect + debug doc strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) @@ -476,25 +402,27 @@ module ::Irc txt = String.new - h = %w{h1 h2 h3 h4 h5 h6} - p = %w{p} - ar = [] - h.each { |hx| - p.each { |px| - ar << "#{hx}~#{px}" - } - } - h_p_css = ar.join("|") - debug "css search: #{h_p_css}" - pre_h = pars = by_span = nil while true debug "Minimum number of spaces: #{min_spaces}" # Initial attempt:

that follows - pre_h = doc/h_p_css if pre_h.nil? - debug "Hx: found: #{pre_h.pretty_inspect}" + if pre_h.nil? + pre_h = Hpricot::Elements[] + found_h = false + doc.search("*") { |e| + next if e.bogusetag? + case e.pathname + when /^h\d/ + found_h = true + when 'p' + pre_h << e if found_h + end + } + debug "Hx: found: #{pre_h.pretty_inspect}" + end + pre_h.each { |p| debug p txt = p.to_html.ircify_html @@ -526,13 +454,23 @@ module ::Irc # 'message' or 'text' in their class to mark actual text. Since we want # the class match to be partial and case insensitive, we collect # the common elements that may have this class and then filter out those - # we don't need + # we don't need. If no divs or spans are found, we'll accept additional + # elements too (td, tr, tbody, table). if by_span.nil? by_span = Hpricot::Elements[] - pre_pars = doc/"div|span|td|tr|tbody|table" - pre_pars.each { |el| - by_span.push el if el.class =~ /body|message|text/i + extra = Hpricot::Elements[] + doc.search("*") { |el| + next if el.bogusetag? + case el.pathname + when AFTER_PAR_PATH + by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS + when AFTER_PAR_EX + extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS + end } + if by_span.empty? and not extra.empty? + by_span.concat extra + end debug "other \#1: found: #{by_span.pretty_inspect}" end @@ -556,9 +494,13 @@ module ::Irc end end - # without hpricot + # HTML first par grabber without hpricot def Utils.ircify_first_html_par_woh(xml_org, opts={}) - xml = xml_org.gsub(//m, '').gsub(/]*)?>.*?<\/script>/im, "").gsub(/]*)?>.*?<\/style>/im, "") + xml = xml_org.gsub(//m, + "").gsub(/]*)?>.*?<\/script>/im, + "").gsub(/]*)?>.*?<\/style>/im, + "").gsub(/]*)?>.*?<\/select>/im, + "") strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) @@ -634,6 +576,143 @@ module ::Irc end end + # This method extracts title, content (first par) and extra + # information from the given document _doc_. + # + # _doc_ can be an URI, a Net::HTTPResponse or a String. + # + # If _doc_ is a String, only title and content information + # are retrieved (if possible), using standard methods. + # + # If _doc_ is an URI or a Net::HTTPResponse, additional + # information is retrieved, and special title/summary + # extraction routines are used if possible. + # + def Utils.get_html_info(doc, opts={}) + case doc + when String + Utils.get_string_html_info(doc, opts) + when Net::HTTPResponse + Utils.get_resp_html_info(doc, opts) + when URI + ret = DataStream.new + @@bot.httputil.get_response(doc) { |resp| + ret.replace Utils.get_resp_html_info(resp, opts) + } + return ret + else + raise + end + end + + class ::UrlLinkError < RuntimeError + end + + # This method extracts title, content (first par) and extra + # information from the given Net::HTTPResponse _resp_. + # + # Currently, the only accepted options (in _opts_) are + # uri_fragment:: the URI fragment of the original request + # full_body:: get the whole body instead of + # @@bot.config['http.info_bytes'] bytes only + # + # Returns a DataStream with the following keys: + # text:: the (partial) body + # title:: the title of the document (if any) + # content:: the first paragraph of the document (if any) + # headers:: + # the headers of the Net::HTTPResponse. The value is + # a Hash whose keys are lowercase forms of the HTTP + # header fields, and whose values are Arrays. + # + def Utils.get_resp_html_info(resp, opts={}) + case resp + when Net::HTTPSuccess + loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil + if loc and loc.fragment and not loc.fragment.empty? + opts[:uri_fragment] ||= loc.fragment + end + ret = DataStream.new(opts.dup) + ret[:headers] = resp.to_hash + ret[:text] = partial = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes']) + + filtered = Utils.try_htmlinfo_filters(ret) + + if filtered + return filtered + elsif resp['content-type'] =~ /^text\/|(?:x|ht)ml/ + ret.merge!(Utils.get_string_html_info(partial, opts)) + end + return ret + else + raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})" + end + end + + # This method runs an appropriately-crafted DataStream _ds_ through the + # filters in the :htmlinfo filter group, in order. If one of the filters + # returns non-nil, its results are merged in _ds_ and returned. Otherwise + # nil is returned. + # + # The input DataStream should have the downloaded HTML as primary key + # (:text) and possibly a :headers key holding the resonse headers. + # + def Utils.try_htmlinfo_filters(ds) + filters = @@bot.filter_names(:htmlinfo) + return nil if filters.empty? + cur = nil + # TODO filter priority + filters.each { |n| + debug "testing htmlinfo filter #{n}" + cur = @@bot.filter(@@bot.global_filter_name(n, :htmlinfo), ds) + debug "returned #{cur.pretty_inspect}" + break if cur + } + return ds.merge(cur) if cur + end + + # HTML info filters often need to check if the webpage location + # of a passed DataStream _ds_ matches a given Regexp. + def Utils.check_location(ds, rx) + debug ds[:headers] + if h = ds[:headers] + loc = [h['x-rbot-location'],h['location']].flatten.grep(rx) + end + loc ||= [] + debug loc + return loc.empty? ? nil : loc + end + + # This method extracts title and content (first par) + # from the given HTML or XML document _text_, using + # standard methods (String#ircify_html_title, + # Utils.ircify_first_html_par) + # + # Currently, the only accepted option (in _opts_) is + # uri_fragment:: the URI fragment of the original request + # + def Utils.get_string_html_info(text, opts={}) + debug "getting string html info" + txt = text.dup + title = txt.ircify_html_title + debug opts + if frag = opts[:uri_fragment] and not frag.empty? + fragreg = /]+\s+)?(?:name|id)=["']?#{frag}["']?[^>]*>/im + debug fragreg + debug txt + if txt.match(fragreg) + # grab the post-match + txt = $' + end + debug txt + end + c_opts = opts.dup + c_opts[:strip] ||= title + content = Utils.ircify_first_html_par(txt, c_opts) + content = nil if content.empty? + return {:title => title, :content => content} + end + # Get the first pars of the first _count_ _urls_. # The pages are downloaded using the bot httputil service. # Returns an array of the first paragraphs fetched. @@ -648,32 +727,39 @@ module ::Irc url = urls.shift idx += 1 - # FIXME what happens if some big file is returned? We should share - # code with the url plugin to only retrieve partial file content! - xml = self.bot.httputil.get(url) - if xml.nil? - debug "Unable to retrieve #{url}" - next - end - par = Utils.ircify_first_html_par(xml, opts) - if par.empty? - debug "No first par found\n#{xml}" - # FIXME only do this if the 'url' plugin is loaded - # TODO even better, put the code here - # par = @bot.plugins['url'].get_title_from_html(xml) - if par.empty? - retval.push(nil) - next + begin + info = Utils.get_html_info(URI.parse(url), opts) + + par = info[:content] + retval.push(par) + + if par + msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg + count -=1 end + rescue + debug "Unable to retrieve #{url}: #{$!}" + next end - msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg - count -=1 - retval.push(par) end return retval end + # Returns a comma separated list except for the last element + # which is joined in with specified conjunction + # + def Utils.comma_list(words, options={}) + defaults = { :join_with => ", ", :join_last_with => _(" and ") } + opts = defaults.merge(options) + + if words.size < 2 + words.last + else + [words[0..-2].join(opts[:join_with]), words.last].join(opts[:join_last_with]) + end + end + end end -Irc::Utils.bot = Irc::Plugins.manager.bot +Irc::Utils.bot = Irc::Bot::Plugins.manager.bot