4 # :title: rbot utilities provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2006 Tom Gilbert
10 # Copyright:: (C) 2007 Giuseppe Bilotta
12 # TODO some of these Utils should be rewritten as extensions to the appropriate
13 # standard Ruby classes and accordingly be moved to extends.rb
20 require 'htmlentities'
21 $we_have_html_entities_decoder = true
25 gems = require 'rubygems'
32 $we_have_html_entities_decoder = false
55 # extra codes, for future use...
69 'otimes' => '⊗',
78 'Epsilon' => 'Ε',
82 'Upsilon' => 'Υ',
84 'there4' => '∴',
89 'rsaquo' => '›',
101 'lceil' => '⌈',
103 'rdquo' => '”',
111 'lfloor' => '⌊',
118 'clubs' => '♣',
119 'diams' => '♦',
126 'Scaron' => 'Š',
132 'sbquo' => '‚',
145 'infin' => '∞',
150 'thinsp' => ' ',
152 'bdquo' => '„',
159 'mdash' => '—',
161 'permil' => '‰',
166 'forall' => '∀',
168 'rceil' => '⌉',
171 'lambda' => 'λ',
175 'dagger' => '†',
178 'image' => 'ℑ',
179 'alefsym' => 'ℵ',
185 'frasl' => '⁄',
187 'lowast' => '∗',
198 'oline' => '‾',
205 'empty' => '∅',
212 'weierp' => '℘',
217 'omicron' => 'ο',
218 'upsilon' => 'υ',
220 'Lambda' => 'Λ',
227 'scaron' => 'š',
228 'lsquo' => '‘',
236 'hellip' => '…',
240 'rfloor' => '⌋',
242 'crarr' => '↵',
244 'notin' => '∉',
245 'exist' => '∃',
248 'Dagger' => '‡',
249 'oplus' => '⊕',
255 'lsaquo' => '‹',
257 'Omicron' => 'Ο',
272 'sigmaf' => 'ς',
274 'minus' => '−',
277 'epsilon' => 'ε',
288 'spades' => '♠',
289 'rsquo' => '’',
293 'thetasym' => 'ϑ',
297 'ldquo' => '“',
298 'hearts' => '♥',
311 # miscellaneous useful functions
314 SEC_PER_HR = SEC_PER_MIN * 60
315 SEC_PER_DAY = SEC_PER_HR * 24
316 SEC_PER_MNTH = SEC_PER_DAY * 30
317 SEC_PER_YR = SEC_PER_MNTH * 12
319 def Utils.secs_to_string_case(array, var, string, plural)
322 array << "1 #{string}"
324 array << "#{var} #{plural}"
328 # turn a number of seconds into a human-readable string, e.g.
329 # 2 days, 3 hours, 18 minutes, 10 seconds
330 def Utils.secs_to_string(secs)
332 years, secs = secs.divmod SEC_PER_YR
333 secs_to_string_case(ret, years, "year", "years") if years > 0
334 months, secs = secs.divmod SEC_PER_MNTH
335 secs_to_string_case(ret, months, "month", "months") if months > 0
336 days, secs = secs.divmod SEC_PER_DAY
337 secs_to_string_case(ret, days, "day", "days") if days > 0
338 hours, secs = secs.divmod SEC_PER_HR
339 secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
340 mins, secs = secs.divmod SEC_PER_MIN
341 secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
343 secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
346 raise "Empty ret array!"
350 return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
355 def Utils.safe_exec(command, *args)
358 return p.readlines.join("\n")
363 rescue Exception => e
364 puts "exec of #{command} led to exception: #{e.inspect}"
367 puts "exec of #{command} failed"
374 @@safe_save_dir = nil unless defined?(@@safe_save_dir)
375 def Utils.set_safe_save_dir(str)
376 @@safe_save_dir = str.dup
379 def Utils.safe_save(file)
380 raise 'No safe save directory defined!' if @@safe_save_dir.nil?
381 basename = File.basename(file)
382 temp = Tempfile.new(basename,@@safe_save_dir)
384 yield temp if block_given?
386 File.rename(temp.path, file)
390 # returns a string containing the result of an HTTP GET on the uri
391 def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
393 # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
394 Net::HTTP.version_1_2
395 # (so we support the 1_1 api anyway, avoids problems)
397 uri = URI.parse uristr
400 query += "?#{uri.query}"
405 if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
406 proxy_host = proxy_uri.host
407 proxy_port = proxy_uri.port
411 http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
412 http.open_timeout = opentimeout
413 http.read_timeout = readtimeout
416 resp = http.get(query)
417 if resp.code == "200"
423 error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
428 def Utils.decode_html_entities(str)
429 if $we_have_html_entities_decoder
430 return HTMLEntities.decode_entities(str)
432 str.gsub(/(&(.+?);)/) {
434 # remove the 0-padding from unicode integers
436 symbol = "##{$1.to_i.to_s}"
439 # output the symbol's irc-translated character, or a * if it's unknown
440 UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
445 HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
446 PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
448 # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
449 # to mark actual text
450 AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
452 # At worst, we can try text that is enclosed between two <br> tags
453 AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
455 # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
456 # If possible, grab the one after the first heading
458 # It is possible to pass some options to determine how the stripping
459 # occurs. Currently supported options are
460 # * :strip => Regex or String to strip at the beginning of the obtained
462 # * :min_spaces => Minimum number of spaces a paragraph should have
464 def Utils.ircify_first_html_par(xml_org, opts={})
465 xml = xml_org.gsub(/<!--.*?-->/, '')
468 strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
470 min_spaces = opts[:min_spaces] || 8
471 min_spaces = 0 if min_spaces < 0
476 debug "Minimum number of spaces: #{min_spaces}"
477 header_found = xml.match(HX_REGEX)
480 while txt.empty? or txt.count(" ") < min_spaces
481 candidate = header_found[PAR_REGEX]
482 break unless candidate
483 txt = candidate.ircify_html
485 txt.sub!(strip, '') if strip
486 debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
490 return txt unless txt.empty? or txt.count(" ") < min_spaces
492 # If we haven't found a first par yet, try to get it from the whole
495 while txt.empty? or txt.count(" ") < min_spaces
496 candidate = header_found[PAR_REGEX]
497 break unless candidate
498 txt = candidate.ircify_html
500 txt.sub!(strip, '') if strip
501 debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
504 return txt unless txt.empty? or txt.count(" ") < min_spaces
506 # Nothing yet ... let's get drastic: we look for non-par elements too,
507 # but only for those that match something that we know is likely to
512 while txt.empty? or txt.count(" ") < min_spaces
513 candidate = header_found[AFTER_PAR1_REGEX]
514 break unless candidate
515 txt = candidate.ircify_html
517 txt.sub!(strip, '') if strip
518 debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
521 return txt unless txt.empty? or txt.count(" ") < min_spaces
525 while txt.empty? or txt.count(" ") < min_spaces
526 candidate = header_found[AFTER_PAR2_REGEX]
527 break unless candidate
528 txt = candidate.ircify_html
530 txt.sub!(strip, '') if strip
531 debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
534 debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
535 return txt unless txt.count(" ") < min_spaces
540 # Get the first pars of the first _count_ _urls_.
541 # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
542 # and echoed as replies to the IRC message passed as _opts_ :message.
544 def Utils.get_first_pars(urls, count, opts={})
547 while count > 0 and urls.length > 0
551 # FIXME what happens if some big file is returned? We should share
552 # code with the url plugin to only retrieve partial file content!
553 xml = opts[:http_util].get_cached(url)
555 debug "Unable to retrieve #{url}"
558 par = Utils.ircify_first_html_par(xml, opts)
560 debug "No first par found\n#{xml}"
561 # FIXME only do this if the 'url' plugin is loaded
562 # TODO even better, put the code here
563 # par = @bot.plugins['url'].get_title_from_html(xml)
566 msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg