# TODO some of these Utils should be rewritten as extensions to the appropriate
# standard Ruby classes and accordingly be moved to extends.rb
-require 'net/http'
-require 'uri'
require 'tempfile'
+require 'set'
+# Try to load htmlentities, fall back to an HTML escape table.
begin
require 'htmlentities'
- $we_have_html_entities_decoder = true
rescue LoadError
gems = nil
begin
if gems
retry
else
- $we_have_html_entities_decoder = false
module ::Irc
module Utils
UNESCAPE_TABLE = {
- 'laquo' => '<<',
- 'raquo' => '>>',
+ 'laquo' => '«',
+ 'raquo' => '»',
'quot' => '"',
'apos' => '\'',
- 'micro' => 'u',
- 'copy' => '(c)',
- 'trade' => '(tm)',
- 'reg' => '(R)',
- '#174' => '(R)',
- '#8220' => '"',
- '#8221' => '"',
- '#8212' => '--',
- '#39' => '\'',
+ 'micro' => 'µ',
+ 'copy' => '©',
+ 'trade' => '™',
+ 'reg' => '®',
'amp' => '&',
'lt' => '<',
'gt' => '>',
- 'hellip' => '...',
- 'nbsp' => ' ',
-=begin
- # extras codes, for future use...
- 'zwnj' => '‌',
- 'aring' => '\xe5',
- 'gt' => '>',
- 'yen' => '\xa5',
- 'ograve' => '\xf2',
- 'Chi' => 'Χ',
- 'bull' => '•',
- 'Egrave' => '\xc8',
- 'Ntilde' => '\xd1',
- 'upsih' => 'ϒ',
- 'Yacute' => '\xdd',
- 'asymp' => '≈',
- 'radic' => '√',
- 'otimes' => '⊗',
- 'nabla' => '∇',
- 'aelig' => '\xe6',
- 'oelig' => 'œ',
- 'equiv' => '≡',
- 'Psi' => 'Ψ',
- 'auml' => '\xe4',
- 'circ' => 'ˆ',
- 'Acirc' => '\xc2',
- 'Epsilon' => 'Ε',
- 'Yuml' => 'Ÿ',
- 'Eta' => 'Η',
- 'Icirc' => '\xce',
- 'Upsilon' => 'Υ',
- 'ndash' => '–',
- 'there4' => '∴',
- 'Prime' => '″',
- 'prime' => '′',
- 'psi' => 'ψ',
- 'Kappa' => 'Κ',
- 'rsaquo' => '›',
- 'Tau' => 'Τ',
- 'darr' => '↓',
- 'ocirc' => '\xf4',
- 'lrm' => '‎',
- 'zwj' => '‍',
- 'cedil' => '\xb8',
- 'Ecirc' => '\xca',
- 'not' => '\xac',
- 'AElig' => '\xc6',
- 'oslash' => '\xf8',
- 'acute' => '\xb4',
- 'lceil' => '⌈',
- 'shy' => '\xad',
- 'rdquo' => '”',
- 'ge' => '≥',
- 'Igrave' => '\xcc',
- 'Ograve' => '\xd2',
- 'euro' => '€',
- 'dArr' => '⇓',
- 'sdot' => '⋅',
- 'nbsp' => '\xa0',
- 'lfloor' => '⌊',
- 'lArr' => '⇐',
- 'Auml' => '\xc4',
- 'larr' => '←',
- 'Atilde' => '\xc3',
- 'Otilde' => '\xd5',
- 'szlig' => '\xdf',
- 'clubs' => '♣',
- 'diams' => '♦',
- 'agrave' => '\xe0',
- 'Ocirc' => '\xd4',
- 'Iota' => 'Ι',
- 'Theta' => 'Θ',
- 'Pi' => 'Π',
- 'OElig' => 'Œ',
- 'Scaron' => 'Š',
- 'frac14' => '\xbc',
- 'egrave' => '\xe8',
- 'sub' => '⊂',
- 'iexcl' => '\xa1',
- 'frac12' => '\xbd',
- 'sbquo' => '‚',
- 'ordf' => '\xaa',
- 'sum' => '∑',
- 'prop' => '∝',
- 'Uuml' => '\xdc',
- 'ntilde' => '\xf1',
- 'sup' => '⊃',
- 'theta' => 'θ',
- 'prod' => '∏',
- 'nsub' => '⊄',
- 'hArr' => '⇔',
- 'rlm' => '‏',
- 'THORN' => '\xde',
- 'infin' => '∞',
- 'yuml' => '\xff',
- 'Mu' => 'Μ',
- 'le' => '≤',
- 'Eacute' => '\xc9',
- 'thinsp' => ' ',
- 'ecirc' => '\xea',
- 'bdquo' => '„',
- 'Sigma' => 'Σ',
- 'fnof' => 'ƒ',
- 'Aring' => '\xc5',
- 'tilde' => '˜',
- 'frac34' => '\xbe',
- 'emsp' => ' ',
- 'mdash' => '—',
- 'uarr' => '↑',
- 'permil' => '‰',
- 'Ugrave' => '\xd9',
- 'rarr' => '→',
- 'Agrave' => '\xc0',
- 'chi' => 'χ',
- 'forall' => '∀',
- 'eth' => '\xf0',
- 'rceil' => '⌉',
- 'iuml' => '\xef',
- 'gamma' => 'γ',
- 'lambda' => 'λ',
- 'harr' => '↔',
- 'rang' => '〉',
- 'xi' => 'ξ',
- 'dagger' => '†',
- 'divide' => '\xf7',
- 'Ouml' => '\xd6',
- 'image' => 'ℑ',
- 'alefsym' => 'ℵ',
- 'igrave' => '\xec',
- 'otilde' => '\xf5',
- 'Oacute' => '\xd3',
- 'sube' => '⊆',
- 'alpha' => 'α',
- 'frasl' => '⁄',
- 'ETH' => '\xd0',
- 'lowast' => '∗',
- 'Nu' => 'Ν',
- 'plusmn' => '\xb1',
- 'Euml' => '\xcb',
- 'real' => 'ℜ',
- 'sup1' => '\xb9',
- 'sup2' => '\xb2',
- 'sup3' => '\xb3',
- 'Oslash' => '\xd8',
- 'Aacute' => '\xc1',
- 'cent' => '\xa2',
- 'oline' => '‾',
- 'Beta' => 'Β',
- 'perp' => '⊥',
- 'Delta' => 'Δ',
- 'loz' => '◊',
- 'pi' => 'π',
- 'iota' => 'ι',
- 'empty' => '∅',
- 'euml' => '\xeb',
- 'brvbar' => '\xa6',
- 'iacute' => '\xed',
- 'para' => '\xb6',
- 'micro' => '\xb5',
- 'cup' => '∪',
- 'weierp' => '℘',
- 'uuml' => '\xfc',
- 'part' => '∂',
- 'icirc' => '\xee',
- 'delta' => 'δ',
- 'omicron' => 'ο',
- 'upsilon' => 'υ',
- 'Iuml' => '\xcf',
- 'Lambda' => 'Λ',
- 'Xi' => 'Ξ',
- 'kappa' => 'κ',
- 'ccedil' => '\xe7',
- 'Ucirc' => '\xdb',
- 'cap' => '∩',
- 'mu' => 'μ',
- 'scaron' => 'š',
- 'lsquo' => '‘',
- 'isin' => '∈',
- 'Zeta' => 'Ζ',
- 'supe' => '⊇',
- 'deg' => '\xb0',
- 'and' => '∧',
- 'tau' => 'τ',
- 'pound' => '\xa3',
- 'hellip' => '…',
- 'curren' => '\xa4',
- 'int' => '∫',
- 'ucirc' => '\xfb',
- 'rfloor' => '⌋',
- 'ensp' => ' ',
- 'crarr' => '↵',
- 'ugrave' => '\xf9',
- 'notin' => '∉',
- 'exist' => '∃',
- 'uArr' => '⇑',
- 'cong' => '≅',
- 'Dagger' => '‡',
- 'oplus' => '⊕',
- 'times' => '\xd7',
- 'atilde' => '\xe3',
- 'piv' => 'ϖ',
- 'ni' => '∋',
- 'Phi' => 'Φ',
- 'lsaquo' => '‹',
- 'Uacute' => '\xda',
- 'Omicron' => 'Ο',
- 'ang' => '∠',
- 'ne' => '≠',
- 'iquest' => '\xbf',
- 'eta' => 'η',
- 'yacute' => '\xfd',
- 'Rho' => 'Ρ',
- 'uacute' => '\xfa',
- 'Alpha' => 'Α',
- 'zeta' => 'ζ',
- 'Omega' => 'Ω',
- 'nu' => 'ν',
- 'sim' => '∼',
- 'sect' => '\xa7',
- 'phi' => 'φ',
- 'sigmaf' => 'ς',
- 'macr' => '\xaf',
- 'minus' => '−',
- 'Ccedil' => '\xc7',
- 'ordm' => '\xba',
- 'epsilon' => 'ε',
- 'beta' => 'β',
- 'rArr' => '⇒',
- 'rho' => 'ρ',
- 'aacute' => '\xe1',
- 'eacute' => '\xe9',
- 'omega' => 'ω',
- 'middot' => '\xb7',
- 'Gamma' => 'Γ',
- 'Iacute' => '\xcd',
- 'lang' => '〈',
- 'spades' => '♠',
- 'rsquo' => '’',
- 'uml' => '\xa8',
- 'thorn' => '\xfe',
- 'ouml' => '\xf6',
- 'thetasym' => 'ϑ',
- 'or' => '∨',
- 'raquo' => '\xbb',
- 'acirc' => '\xe2',
- 'ldquo' => '“',
- 'hearts' => '♥',
- 'sigma' => 'σ',
- 'oacute' => '\xf3',
-=end
+ 'hellip' => '…',
+ 'nbsp' => ' ',
+ 'Agrave' => 'À',
+ 'Aacute' => 'Á',
+ 'Acirc' => 'Â',
+ 'Atilde' => 'Ã',
+ 'Auml' => 'Ä',
+ 'Aring' => 'Å',
+ 'AElig' => 'Æ',
+ 'OElig' => 'Œ',
+ 'Ccedil' => 'Ç',
+ 'Egrave' => 'È',
+ 'Eacute' => 'É',
+ 'Ecirc' => 'Ê',
+ 'Euml' => 'Ë',
+ 'Igrave' => 'Ì',
+ 'Iacute' => 'Í',
+ 'Icirc' => 'Î',
+ 'Iuml' => 'Ï',
+ 'ETH' => 'Ð',
+ 'Ntilde' => 'Ñ',
+ 'Ograve' => 'Ò',
+ 'Oacute' => 'Ó',
+ 'Ocirc' => 'Ô',
+ 'Otilde' => 'Õ',
+ 'Ouml' => 'Ö',
+ 'Oslash' => 'Ø',
+ 'Ugrave' => 'Ù',
+ 'Uacute' => 'Ú',
+ 'Ucirc' => 'Û',
+ 'Uuml' => 'Ü',
+ 'Yacute' => 'Ý',
+ 'THORN' => 'Þ',
+ 'szlig' => 'ß',
+ 'agrave' => 'à',
+ 'aacute' => 'á',
+ 'acirc' => 'â',
+ 'atilde' => 'ã',
+ 'auml' => 'ä',
+ 'aring' => 'å',
+ 'aelig' => 'æ',
+ 'oelig' => 'œ',
+ 'ccedil' => 'ç',
+ 'egrave' => 'è',
+ 'eacute' => 'é',
+ 'ecirc' => 'ê',
+ 'euml' => 'ë',
+ 'igrave' => 'ì',
+ 'iacute' => 'í',
+ 'icirc' => 'î',
+ 'iuml' => 'ï',
+ 'eth' => 'ð',
+ 'ntilde' => 'ñ',
+ 'ograve' => 'ò',
+ 'oacute' => 'ó',
+ 'ocirc' => 'ô',
+ 'otilde' => 'õ',
+ 'ouml' => 'ö',
+ 'oslash' => 'ø',
+ 'ugrave' => 'ù',
+ 'uacute' => 'ú',
+ 'ucirc' => 'û',
+ 'uuml' => 'ü',
+ 'yacute' => 'ý',
+ 'thorn' => 'þ',
+ 'yuml' => 'ÿ'
}
end
end
end
end
+begin # Prefer hpricot for HTML parsing; fall back to plain regular expressions when it cannot be loaded
+ require 'hpricot'
+ module ::Irc
+ module Utils
+ AFTER_PAR_PATH = /^(?:div|span)$/ # element names examined first when looking for text containers
+ AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/ # extra element names, accepted only when no div/span qualifies
+ AFTER_PAR_CLASS = /body|message|text/i # partial, case-insensitive match tested against class and id attributes
+ end
+ end
+rescue LoadError
+ gems = nil
+ begin
+ gems = require 'rubygems' # require returns true only when this very call loads rubygems
+ rescue LoadError
+ gems = false
+ end
+ if gems
+ retry # rubygems was just loaded: try hpricot again (on a second failure rubygems is already loaded, so we fall through)
+ else
+ module ::Irc
+ module Utils
+ # Some regular expressions to manage HTML data (fallback used when hpricot cannot be loaded)
+
+ # Title
+ TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
+
+ # H1, H2, etc
+ HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
+ # A paragraph
+ PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+ # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
+ # to mark actual text
+ AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+ # At worst, we can try the text enclosed between two <br>
+ AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
+ end
+ end
+ end
+end
module ::Irc
- # miscellaneous useful functions
+ # Miscellaneous useful functions
module Utils
@@bot = nil unless defined? @@bot
@@safe_save_dir = nil unless defined?(@@safe_save_dir)
+ # The bot instance
def Utils.bot
@@bot
end
+ # Set up some Utils routines which depend on the associated bot: stores _b_ as the bot instance and derives the safe save directory from it.
def Utils.bot=(b)
+ debug "initializing utils"
@@bot = b
@@safe_save_dir = "#{@@bot.botclass}/safe_save" # directory checked by Utils.safe_save, which refuses to run while it is nil
end
+ # Seconds per minute
SEC_PER_MIN = 60
+ # Seconds per hour
SEC_PER_HR = SEC_PER_MIN * 60
+ # Seconds per day
SEC_PER_DAY = SEC_PER_HR * 24
+ # Seconds per (30-day) month
SEC_PER_MNTH = SEC_PER_DAY * 30
+ # Seconds per (30*12 = 360 day) year
SEC_PER_YR = SEC_PER_MNTH * 12
+ # Auxiliary method needed by Utils.secs_to_string
def Utils.secs_to_string_case(array, var, string, plural)
case var
when 1
end
end
- # turn a number of seconds into a human readable string, e.g
- # 2 days, 3 hours, 18 minutes, 10 seconds
+ # Turn a number of seconds into a human readable string, e.g
+ # 2 days, 3 hours, 18 minutes and 10 seconds
def Utils.secs_to_string(secs)
ret = []
years, secs = secs.divmod SEC_PER_YR
- secs_to_string_case(ret, years, "year", "years") if years > 0
+ secs_to_string_case(ret, years, _("year"), _("years")) if years > 0
months, secs = secs.divmod SEC_PER_MNTH
- secs_to_string_case(ret, months, "month", "months") if months > 0
+ secs_to_string_case(ret, months, _("month"), _("months")) if months > 0
days, secs = secs.divmod SEC_PER_DAY
- secs_to_string_case(ret, days, "day", "days") if days > 0
+ secs_to_string_case(ret, days, _("day"), _("days")) if days > 0
hours, secs = secs.divmod SEC_PER_HR
- secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
+ secs_to_string_case(ret, hours, _("hour"), _("hours")) if hours > 0
mins, secs = secs.divmod SEC_PER_MIN
- secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
+ secs_to_string_case(ret, mins, _("minute"), _("minutes")) if mins > 0
secs = secs.to_i
- secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
+ secs_to_string_case(ret, secs, _("second"), _("seconds")) if secs > 0 or ret.empty?
case ret.length
when 0
raise "Empty ret array!"
when 1
return ret.to_s
else
- return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
+ return [ret[0, ret.length-1].join(", ") , ret[-1]].join(_(" and "))
end
end
+ # Execute an external program, returning a String obtained by redirecting
+ # the program's standard error and standard output
+ #
def Utils.safe_exec(command, *args)
- IO.popen("-") {|p|
- if(p)
+ IO.popen("-") { |p|
+ if p
return p.readlines.join("\n")
else
begin
- $stderr = $stdout
+ $stderr.reopen($stdout)
exec(command, *args)
rescue Exception => e
- puts "exec of #{command} led to exception: #{e.inspect}"
+ puts "exec of #{command} led to exception: #{e.pretty_inspect}"
Kernel::exit! 0
end
puts "exec of #{command} failed"
end
+ # Safely (atomically) save to _file_, by passing a tempfile to the block
+ # and then moving the tempfile to its final location when done.
+ #
+ # call-seq: Utils.safe_save(file, &block)
+ #
def Utils.safe_save(file)
raise 'No safe save directory defined!' if @@safe_save_dir.nil?
basename = File.basename(file)
end
+ # Decode HTML entities in the String _str_, using HTMLEntities if the
+ # package was found, or UNESCAPE_TABLE otherwise.
+ #
def Utils.decode_html_entities(str)
- if $we_have_html_entities_decoder
+ if defined? ::HTMLEntities
return HTMLEntities.decode_entities(str)
else
str.gsub(/(&(.+?);)/) {
end
# output the symbol's irc-translated character, or a * if it's unknown
- UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
+ # decimal entities are converted from the symbol itself ($0 would be the program name, not the match)
+ UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [symbol.to_i].pack("U") : '*')
}
end
end
- HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
- PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
- # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
- # to mark actual text
- AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
- # At worst, we can try stuff which is comprised between two <br>
- AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
-
# Try to grab and IRCify the first HTML par (<p> tag) in the given string.
# If possible, grab the one after the first heading
#
# It is possible to pass some options to determine how the stripping
# occurs. Currently supported options are
- # * :strip => Regex or String to strip at the beginning of the obtained
- # text
- # * :min_spaces => Minimum number of spaces a paragraph should have
+ # strip:: Regex or String to strip at the beginning of the obtained
+ # text
+ # min_spaces:: minimum number of spaces a paragraph should have
#
def Utils.ircify_first_html_par(xml_org, opts={})
- xml = xml_org.gsub(/<!--.*?-->/, '')
+ if defined? ::Hpricot
+ Utils.ircify_first_html_par_wh(xml_org, opts)
+ else
+ Utils.ircify_first_html_par_woh(xml_org, opts)
+ end
+ end
+
+ # HTML first par grabber using hpricot: returns the ircified text of the first candidate element holding at least opts[:min_spaces] spaces (default 8), halving that threshold after each unsuccessful pass
+ def Utils.ircify_first_html_par_wh(xml_org, opts={})
+ doc = Hpricot(xml_org)
+
+ # Strip styles and scripts
+ (doc/"style|script").remove
+
+ debug doc
+
+ strip = opts[:strip]
+ strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) # a String is turned into an escaped Regexp anchored at the beginning
+
+ min_spaces = opts[:min_spaces] || 8
+ min_spaces = 0 if min_spaces < 0 # negative thresholds are treated as 0
+
+ txt = String.new
+
+ pre_h = pars = by_span = nil # candidate lists, computed on the first pass and reused by the following ones
+
+ while true
+ debug "Minimum number of spaces: #{min_spaces}"
+
+ # Initial attempt: <p> that follows <h\d>
+ if pre_h.nil?
+ pre_h = Hpricot::Elements[]
+ found_h = false
+ doc.search("*") { |e|
+ next if e.bogusetag?
+ case e.pathname
+ when /^h\d/
+ found_h = true
+ when 'p'
+ pre_h << e if found_h # only paragraphs met after the first heading are collected
+ end
+ }
+ debug "Hx: found: #{pre_h.pretty_inspect}"
+ end
+
+ pre_h.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces # stop at the first candidate that is long enough
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Second natural attempt: just get any <p>
+ pars = doc/"p" if pars.nil?
+ debug "par: found: #{pars.pretty_inspect}"
+ pars.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Nothing yet ... let's get drastic: we look for non-par elements too,
+ # but only for those that match something that we know is likely to
+ # contain text
+
+ # Some blogging and forum platforms use spans or divs with a 'body' or
+ # 'message' or 'text' in their class to mark actual text. Since we want
+ # the class match to be partial and case insensitive, we collect
+ # the common elements that may have this class and then filter out those
+ # we don't need. If no divs or spans are found, we'll accept additional
+ # elements too (td, tr, tbody, table).
+ if by_span.nil?
+ by_span = Hpricot::Elements[]
+ extra = Hpricot::Elements[]
+ doc.search("*") { |el|
+ next if el.bogusetag?
+ case el.pathname
+ when AFTER_PAR_PATH
+ by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+ when AFTER_PAR_EX
+ extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+ end
+ }
+ if by_span.empty? and not extra.empty?
+ by_span.concat extra
+ end
+ debug "other \#1: found: #{by_span.pretty_inspect}"
+ end
+
+ by_span.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # At worst, we can try stuff which is comprised between two <br>
+ # TODO
+
+ debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
+ return txt unless txt.count(" ") < min_spaces # unlike the checks above, an empty txt is accepted here
+ break if min_spaces == 0 # safety net: with a threshold of 0 the return above always fires
+ min_spaces /= 2 # relax the requirement and run all the attempts again
+ end
+ end
+
+ # HTML first par grabber without hpricot
+ def Utils.ircify_first_html_par_woh(xml_org, opts={})
+ xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
return txt unless txt.count(" ") < min_spaces
+ break if min_spaces == 0
min_spaces /= 2
end
end
+ # This method extracts title, content (first par) and extra
+ # information from the given document _doc_.
+ #
+ # _doc_ can be an URI, a Net::HTTPResponse or a String.
+ #
+ # If _doc_ is a String, only title and content information
+ # are retrieved (if possible), using standard methods.
+ #
+ # If _doc_ is an URI or a Net::HTTPResponse, additional
+ # information is retrieved, and special title/summary
+ # extraction routines are used if possible.
+ #
+ def Utils.get_html_info(doc, opts={})
+ case doc
+ when String
+ Utils.get_string_html_info(doc, opts)
+ when Net::HTTPResponse
+ Utils.get_resp_html_info(doc, opts)
+ when URI
+ ret = Hash.new
+ @@bot.httputil.get_response(doc) { |resp|
+ ret = Utils.get_resp_html_info(resp, opts)
+ }
+ return ret
+ else
+ raise
+ end
+ end
+
+ class ::UrlLinkError < RuntimeError # raised by Utils.get_resp_html_info when the response is not a Net::HTTPSuccess
+ end
+
+ # This method extracts title, content (first par) and extra
+ # information from the given Net::HTTPResponse _resp_.
+ #
+ # Currently, the only accepted option (in _opts_) is
+ # uri_fragment:: the URI fragment of the original request
+ #
+ # Returns a Hash with the following keys:
+ # title:: the title of the document (if any)
+ # content:: the first paragraph of the document (if any)
+ # headers::
+ # the headers of the Net::HTTPResponse. The value is
+ # a Hash whose keys are lowercase forms of the HTTP
+ # header fields, and whose values are Arrays.
+ #
+ def Utils.get_resp_html_info(resp, opts={})
+ ret = Hash.new
+ case resp
+ when Net::HTTPSuccess
+ ret[:headers] = resp.to_hash
+
+ partial = resp.partial_body(@@bot.config['http.info_bytes'])
+ if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
+ loc = URI.parse(resp['x-rbot-location'] || resp['location']) rescue nil
+ if loc and loc.fragment and not loc.fragment.empty?
+ opts[:uri_fragment] ||= loc.fragment
+ end
+ ret.merge!(Utils.get_string_html_info(partial, opts))
+ end
+ return ret
+ else
+ raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+ end
+ end
+
+ # This method extracts title and content (first par)
+ # from the given HTML or XML document _text_, using
+ # standard methods (String#ircify_html_title,
+ # Utils.ircify_first_html_par)
+ #
+ # Currently, the only accepted option (in _opts_) is
+ # uri_fragment:: the URI fragment of the original request
+ #
+ def Utils.get_string_html_info(text, opts={})
+ txt = text.dup
+ title = txt.ircify_html_title
+ if frag = opts[:uri_fragment] and not frag.empty?
+ fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
+ txt.sub!(fragreg,'')
+ end
+ c_opts = opts.dup
+ c_opts[:strip] ||= title
+ content = Utils.ircify_first_html_par(txt, c_opts)
+ content = nil if content.empty?
+ return {:title => title, :content => content}
+ end
+
# Get the first pars of the first _count_ _urls_.
# The pages are downloaded using the bot httputil service.
# Returns an array of the first paragraphs fetched.
url = urls.shift
idx += 1
- # FIXME what happens if some big file is returned? We should share
- # code with the url plugin to only retrieve partial file content!
- xml = self.bot.httputil.get_cached(url)
- if xml.nil?
- debug "Unable to retrieve #{url}"
- next
- end
- par = Utils.ircify_first_html_par(xml, opts)
- if par.empty?
- debug "No first par found\n#{xml}"
- # FIXME only do this if the 'url' plugin is loaded
- # TODO even better, put the code here
- # par = @bot.plugins['url'].get_title_from_html(xml)
- if par.empty?
- retval.push(nil)
- next
+ begin
+ info = Utils.get_html_info(URI.parse(url), opts)
+
+ par = info[:content]
+ retval.push(par)
+
+ if par
+ msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
+ count -=1
end
+ rescue
+ debug "Unable to retrieve #{url}: #{$!}"
+ next
end
- msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
- count -=1
- retval.push(par)
end
return retval
end
-
end
end
+
+Irc::Utils.bot = Irc::Bot::Plugins.manager.bot