4 # :title: rbot utilities provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2006 Tom Gilbert
10 # Copyright:: (C) 2007 Giuseppe Bilotta
12 # TODO some of these Utils should be rewritten as extensions to the appropriate
13 # standard Ruby classes and accordingly be moved to extends.rb
20 require 'htmlentities'
21 $we_have_html_entities_decoder = true
23 gems = require 'rubygems' rescue false
27 $we_have_html_entities_decoder = false
50 # extras codes, for future use...
64 'otimes' => '⊗',
73 'Epsilon' => 'Ε',
77 'Upsilon' => 'Υ',
79 'there4' => '∴',
84 'rsaquo' => '›',
106 'lfloor' => '⌊',
113 'clubs' => '♣',
114 'diams' => '♦',
121 'Scaron' => 'Š',
127 'sbquo' => '‚',
140 'infin' => '∞',
145 'thinsp' => ' ',
147 'bdquo' => '„',
154 'mdash' => '—',
156 'permil' => '‰',
161 'forall' => '∀',
163 'rceil' => '⌉',
166 'lambda' => 'λ',
170 'dagger' => '†',
173 'image' => 'ℑ',
174 'alefsym' => 'ℵ',
180 'frasl' => '⁄',
182 'lowast' => '∗',
193 'oline' => '‾',
200 'empty' => '∅',
207 'weierp' => '℘',
212 'omicron' => 'ο',
213 'upsilon' => 'υ',
215 'Lambda' => 'Λ',
222 'scaron' => 'š',
223 'lsquo' => '‘',
231 'hellip' => '…',
235 'rfloor' => '⌋',
237 'crarr' => '↵',
239 'notin' => '∉',
240 'exist' => '∃',
243 'Dagger' => '‡',
244 'oplus' => '⊕',
250 'lsaquo' => '‹',
252 'Omicron' => 'Ο',
267 'sigmaf' => 'ς',
269 'minus' => '−',
272 'epsilon' => 'ε',
283 'spades' => '♠',
284 'rsquo' => '’',
288 'thetasym' => 'ϑ',
292 'ldquo' => '“',
293 'hearts' => '♥',
306 # miscellaneous useful functions
309 SEC_PER_HR = SEC_PER_MIN * 60
310 SEC_PER_DAY = SEC_PER_HR * 24
311 SEC_PER_MNTH = SEC_PER_DAY * 30
312 SEC_PER_YR = SEC_PER_MNTH * 12
314 def Utils.secs_to_string_case(array, var, string, plural)
317 array << "1 #{string}"
319 array << "#{var} #{plural}"
323 # turn a number of seconds into a human readable string, e.g
324 # 2 days, 3 hours, 18 minutes, 10 seconds
325 def Utils.secs_to_string(secs)
327 years, secs = secs.divmod SEC_PER_YR
328 secs_to_string_case(ret, years, "year", "years") if years > 0
329 months, secs = secs.divmod SEC_PER_MNTH
330 secs_to_string_case(ret, months, "month", "months") if months > 0
331 days, secs = secs.divmod SEC_PER_DAY
332 secs_to_string_case(ret, days, "day", "days") if days > 0
333 hours, secs = secs.divmod SEC_PER_HR
334 secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
335 mins, secs = secs.divmod SEC_PER_MIN
336 secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
338 secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
341 raise "Empty ret array!"
345 return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
350 def Utils.safe_exec(command, *args)
353 return p.readlines.join("\n")
358 rescue Exception => e
359 puts "exec of #{command} led to exception: #{e.inspect}"
362 puts "exec of #{command} failed"
369 @@safe_save_dir = nil unless defined?(@@safe_save_dir)
370 def Utils.set_safe_save_dir(str)
371 @@safe_save_dir = str.dup
374 def Utils.safe_save(file)
375 raise 'No safe save directory defined!' if @@safe_save_dir.nil?
376 basename = File.basename(file)
377 temp = Tempfile.new(basename,@@safe_save_dir)
379 yield temp if block_given?
381 File.rename(temp.path, file)
385 # returns a string containing the result of an HTTP GET on the uri
386 def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
388 # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
389 Net::HTTP.version_1_2
390 # (so we support the 1_1 api anyway, avoids problems)
392 uri = URI.parse uristr
395 query += "?#{uri.query}"
400 if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
401 proxy_host = proxy_uri.host
402 proxy_port = proxy_uri.port
406 http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
407 http.open_timeout = opentimeout
408 http.read_timeout = readtimeout
411 resp = http.get(query)
412 if resp.code == "200"
418 error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
423 def Utils.decode_html_entities(str)
424 if $we_have_html_entities_decoder
425 return HTMLEntities.decode_entities(str)
427 str.gsub(/(&(.+?);)/) {
429 # remove the 0-paddng from unicode integers
431 symbol = "##{$1.to_i.to_s}"
434 # output the symbol's irc-translated character, or a * if it's unknown
435 UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
440 HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
441 PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
443 # Some blogging and forum platforms use spans or divs with a 'body' in their class
444 # to mark actual text
445 AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
447 # At worst, we can try stuff which is comprised between two <br>
448 AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
450 # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
451 # If possible, grab the one after the first heading
453 # It is possible to pass some options to determine how the stripping
454 # occurs. Currently supported options are
455 # * :strip => Regex or String to strip at the beginning of the obtained
457 # * :min_spaces => Minimum number of spaces a paragraph should have
459 def Utils.ircify_first_html_par(xml, opts={})
463 strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
465 min_spaces = opts[:min_spaces] || 8
466 min_spaces = 0 if min_spaces < 0
469 debug "Minimum number of spaces: #{min_spaces}"
470 header_found = xml.match(HX_REGEX)
473 while txt.empty? or txt.count(" ") < min_spaces
474 candidate = header_found[PAR_REGEX]
475 break unless candidate
476 txt = candidate.ircify_html
478 txt.sub!(strip, '') if strip
479 debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
483 return txt unless txt.empty? or txt.count(" ") < min_spaces
485 # If we haven't found a first par yet, try to get it from the whole
488 while txt.empty? or txt.count(" ") < min_spaces
489 candidate = header_found[PAR_REGEX]
490 break unless candidate
491 txt = candidate.ircify_html
493 txt.sub!(strip, '') if strip
494 debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
497 return txt unless txt.empty? or txt.count(" ") < min_spaces
499 # Nothing yet ... let's get drastic: we look for non-par elements too,
500 # but only for those that match something that we know is likely to
505 while txt.empty? or txt.count(" ") < min_spaces
506 candidate = header_found[AFTER_PAR1_REGEX]
507 break unless candidate
508 txt = candidate.ircify_html
510 txt.sub!(strip, '') if strip
511 debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
516 while txt.empty? or txt.count(" ") < min_spaces
517 candidate = header_found[AFTER_PAR2_REGEX]
518 break unless candidate
519 txt = candidate.ircify_html
521 txt.sub!(strip, '') if strip
522 debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
525 debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
526 return txt unless txt.count(" ") < min_spaces
531 # Get the first pars of the first _count_ _urls_.
532 # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
533 # and echoed as replies to the IRC message passed as _opts_ :message.
535 def Utils.get_first_pars(urls, count, opts={})
538 while count > 0 and urls.length > 0
542 # FIXME what happens if some big file is returned? We should share
543 # code with the url plugin to only retrieve partial file content!
544 xml = opts[:http_util].get_cached(url)
546 debug "Unable to retrieve #{url}"
549 par = Utils.ircify_first_html_par(xml, opts)
551 debug "No first par found\n#{xml}"
552 # FIXME only do this if the 'url' plugin is loaded
553 # TODO even better, put the code here
554 # par = @bot.plugins['url'].get_title_from_html(xml)
557 msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg