4 # :title: rbot utilities provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2006 Tom Gilbert
10 # Copyright:: (C) 2007 Giuseppe Bilotta
12 # TODO some of these Utils should be rewritten as extensions to the appropriate
13 # standard Ruby classes and accordingly be moved to extends.rb
# Probe for the optional 'htmlentities' library. The global flag
# $we_have_html_entities_decoder records the outcome and is consulted by
# Utils.decode_html_entities to choose between HTMLEntities and the built-in
# fallback.
# NOTE(review): the begin/rescue structure around these statements is not
# visible in this excerpt -- confirm the exact fallback order in the full file.
20 require 'htmlentities'
21 $we_have_html_entities_decoder = true
# NOTE(review): a modifier `rescue` only catches StandardError, while a failed
# require raises LoadError (a ScriptError), so this `rescue false` presumably
# does not cover a missing rubygems. Also, require returns false when the
# library is already loaded -- confirm this condition behaves as intended.
23 if require 'rubygems' rescue false
26 $we_have_html_entities_decoder = false
49 # extra codes, for future use...
63 'otimes' => '⊗',
72 'Epsilon' => 'Ε',
76 'Upsilon' => 'Υ',
78 'there4' => '∴',
83 'rsaquo' => '›',
105 'lfloor' => '⌊',
112 'clubs' => '♣',
113 'diams' => '♦',
120 'Scaron' => 'Š',
126 'sbquo' => '‚',
139 'infin' => '∞',
144 'thinsp' => ' ',
146 'bdquo' => '„',
153 'mdash' => '—',
155 'permil' => '‰',
160 'forall' => '∀',
162 'rceil' => '⌉',
165 'lambda' => 'λ',
169 'dagger' => '†',
172 'image' => 'ℑ',
173 'alefsym' => 'ℵ',
179 'frasl' => '⁄',
181 'lowast' => '∗',
192 'oline' => '‾',
199 'empty' => '∅',
206 'weierp' => '℘',
211 'omicron' => 'ο',
212 'upsilon' => 'υ',
214 'Lambda' => 'Λ',
221 'scaron' => 'š',
222 'lsquo' => '‘',
230 'hellip' => '…',
234 'rfloor' => '⌋',
236 'crarr' => '↵',
238 'notin' => '∉',
239 'exist' => '∃',
242 'Dagger' => '‡',
243 'oplus' => '⊕',
249 'lsaquo' => '‹',
251 'Omicron' => 'Ο',
266 'sigmaf' => 'ς',
268 'minus' => '−',
271 'epsilon' => 'ε',
282 'spades' => '♠',
283 'rsquo' => '’',
287 'thetasym' => 'ϑ',
291 'ldquo' => '“',
292 'hearts' => '♥',
305 # miscellaneous useful functions
# Lengths of the time units in seconds, each derived from the next smaller
# one (SEC_PER_MIN is defined outside the lines visible here).
# A month is taken as 30 days and a year as 12 such months (360 days), so
# month and year figures are approximations rather than calendar-exact values.
308 SEC_PER_HR = SEC_PER_MIN * 60
309 SEC_PER_DAY = SEC_PER_HR * 24
310 SEC_PER_MNTH = SEC_PER_DAY * 30
311 SEC_PER_YR = SEC_PER_MNTH * 12
# Helper for Utils.secs_to_string: appends one "<count> <unit>" fragment to
# +array+.
#
# array::  collection the fragment is appended to (via <<)
# var::    the count, interpolated in the plural form
# string:: singular unit name, used in the fixed text "1 <string>"
# plural:: plural unit name
#
# NOTE(review): the condition choosing between the two forms is not visible
# in this excerpt; presumably the singular form is used when var == 1 --
# confirm.
313 def Utils.secs_to_string_case(array, var, string, plural)
316 array << "1 #{string}"
318 array << "#{var} #{plural}"
322 # turn a number of seconds into a human readable string, e.g
323 # 2 days, 3 hours, 18 minutes, 10 seconds
#
# secs:: the duration in seconds. It is split with successive divmod calls
#        into years, months, days, hours and minutes; what is left over is
#        the seconds component.
#
# Components whose count is zero are skipped, except that the seconds
# component is still emitted when nothing else was, so a zero duration
# produces text too.
# NOTE(review): the visible return statement attaches the last component with
# " and " rather than ", ", so for multi-component results the example above
# would read "... 18 minutes and 10 seconds" -- confirm against the full
# method, since other return paths are not visible here.
324 def Utils.secs_to_string(secs)
# Each divmod yields the count for one unit and rebinds secs to the remainder.
326 years, secs = secs.divmod SEC_PER_YR
327 secs_to_string_case(ret, years, "year", "years") if years > 0
328 months, secs = secs.divmod SEC_PER_MNTH
329 secs_to_string_case(ret, months, "month", "months") if months > 0
330 days, secs = secs.divmod SEC_PER_DAY
331 secs_to_string_case(ret, days, "day", "days") if days > 0
332 hours, secs = secs.divmod SEC_PER_HR
333 secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
334 mins, secs = secs.divmod SEC_PER_MIN
335 secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
# "or ret.empty?" guarantees at least one component ends up in ret.
337 secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
# NOTE(review): presumably a defensive branch for an unexpectedly empty ret;
# the guarding condition is not visible here.
340 raise "Empty ret array!"
# All but the last component joined with ", ", then the last one after " and ".
344 return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
# Runs +command+ with the optional +args+ and returns the lines read from +p+
# joined into a single String. Failures are reported with puts.
# NOTE(review): how the command is spawned is not visible in this excerpt;
# +p+ is presumably the IO object connected to the child process -- confirm.
349 def Utils.safe_exec(command, *args)
# NOTE(review): if +p+ is an IO, readlines keeps each line's terminator, so
# joining with "\n" would double the line breaks -- confirm this is intended.
352 return p.readlines.join("\n")
# NOTE(review): rescuing Exception also traps non-StandardError exceptions
# such as Interrupt and SystemExit -- confirm that this breadth is wanted.
357 rescue Exception => e
358 puts "exec of #{command} led to exception: #{e.inspect}"
361 puts "exec of #{command} failed"
# Directory in which Utils.safe_save creates its temporary files. It is
# initialised to nil only if not already defined, so re-evaluating this file
# keeps a previously configured value.
368 @@safe_save_dir = nil unless defined?(@@safe_save_dir)
# Sets the directory used by Utils.safe_save.
#
# str:: the directory path; a copy (dup) is stored, so later changes to the
#       caller's string do not affect the stored value.
369 def Utils.set_safe_save_dir(str)
370 @@safe_save_dir = str.dup
# Saves +file+ by way of a temporary file: a Tempfile named after the
# basename of +file+ is created in the safe-save directory, yielded to the
# caller's block (if one is given) so the content can be written, and then
# renamed onto +file+.
#
# file:: path of the file to be replaced
#
# Raises a RuntimeError ('No safe save directory defined!') when
# Utils.set_safe_save_dir has not supplied a directory.
# NOTE(review): File.rename generally cannot move files across filesystems,
# so the safe-save directory presumably has to live on the same filesystem
# as +file+ -- confirm.
373 def Utils.safe_save(file)
374 raise 'No safe save directory defined!' if @@safe_save_dir.nil?
375 basename = File.basename(file)
376 temp = Tempfile.new(basename,@@safe_save_dir)
378 yield temp if block_given?
380 File.rename(temp.path, file)
384 # returns a string containing the result of an HTTP GET on the uri
#
# uristr::      the URI to fetch, as a String (parsed with URI.parse)
# readtimeout:: value assigned to the connection's read_timeout (default 8)
# opentimeout:: value assigned to the connection's open_timeout (default 4)
#
# When the http_proxy environment variable is set, its host and port are
# passed to Net::HTTP.new as the proxy.
# NOTE(review): what is returned for a non-"200" response or after an
# exception is not visible in this excerpt -- presumably nil; confirm.
385 def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
387 # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
388 Net::HTTP.version_1_2
389 # (so we support the 1_1 api anyway, avoids problems)
391 uri = URI.parse uristr
# Appends the URI's query string to the request path.
# NOTE(review): the initial value of query and the guard for this line are
# not visible here -- presumably uri.path and a check that uri.query is set.
394 query += "?#{uri.query}"
# The assignment inside the condition is intentional: proxy_uri is only
# parsed when the environment variable exists.
399 if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
400 proxy_host = proxy_uri.host
401 proxy_port = proxy_uri.port
405 http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
406 http.open_timeout = opentimeout
407 http.read_timeout = readtimeout
410 resp = http.get(query)
# The response code is compared as a String.
411 if resp.code == "200"
417 error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
# Decodes the HTML entities found in +str+.
#
# str:: the String to decode
#
# When the 'htmlentities' library was loaded (see
# $we_have_html_entities_decoder) the work is delegated to
# HTMLEntities.decode_entities. Otherwise every "&...;" sequence is replaced
# by the result of the fallback block below.
422 def Utils.decode_html_entities(str)
423 if $we_have_html_entities_decoder
424 return HTMLEntities.decode_entities(str)
# Fallback: non-greedy match of everything between '&' and the next ';'.
426 str.gsub(/(&(.+?);)/) {
428 # remove the 0-padding from unicode integers
# NOTE(review): the match that sets $1 for the next line is not visible in
# this excerpt -- presumably a test for a leading '#'; confirm.
430 symbol = "##{$1.to_i.to_s}"
433 # output the symbol's irc-translated character, or a * if it's unknown
# Lookup order: the UNESCAPE_TABLE entry, else the character whose code point
# is the first run of decimal digits in symbol; '*' only if this raises.
# NOTE(review): for a symbol without digits, symbol[/\d+/] is nil and
# nil.to_i is 0, so an unknown named entity presumably yields a NUL
# character rather than '*' -- confirm.
434 UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
# Matches a heading element (<h> followed by one digit, optional attributes)
# up to the closing tag with the same digit. Group 1 is the digit, group 2
# the heading content. Case-insensitive; '.' also matches newlines.
439 HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
# Matches from an opening <p> tag (optional attributes) up to and including
# the next opening or closing p/div/html/body/table/td/tr tag.
440 PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
442 # Some blogging and forum platforms use spans or divs with a 'body' in their class
443 # to mark actual text
# Matches any tag whose attribute text contains "body", up to and including
# the next opening or closing p/div/html/body/table/td/tr tag. The pattern
# looks for "body" anywhere inside the tag, not only in a class attribute.
444 AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
446 # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
447 # If possible, grab the one after the first heading
449 # It is possible to pass some options to determine how the stripping
450 # occurs. Currently supported options are
451 # * :strip => Regex or String to strip at the beginning of the obtained
453 # * :min_spaces => Minimum number of spaces a paragraph should have
#
# xml::  the HTML/XML text to search
# opts:: options Hash, see above; :min_spaces defaults to 8
#
# Three attempts are made in order, each looping over candidates until one
# is non-empty and contains at least min_spaces spaces: paragraphs after the
# first heading, paragraphs anywhere, and finally elements matched by
# AFTER_PAR1_REGEX.
# NOTE(review): the value returned when every attempt fails is not visible
# in this excerpt -- confirm.
455 def Utils.ircify_first_html_par(xml, opts={})
# A String :strip option is escaped and turned into an anchored Regexp.
# NOTE(review): in Ruby ^ anchors at the start of any line, not only at the
# start of the string; \A may be what is intended -- confirm.
459 strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
461 min_spaces = opts[:min_spaces] || 8
# Negative values are treated as "no minimum".
462 min_spaces = 0 if min_spaces < 0
465 debug "Minimum number of spaces: #{min_spaces}"
466 header_found = xml.match(HX_REGEX)
# First attempt: paragraphs following the first heading.
# NOTE(review): header_found is indexed with a Regexp below, so it is
# presumably rebound to a String (e.g. the text after the match) on lines not
# visible here -- confirm.
469 while txt.empty? or txt.count(" ") < min_spaces
470 candidate = header_found[PAR_REGEX]
471 break unless candidate
472 txt = candidate.ircify_html
474 txt.sub!(strip, '') if strip
475 debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
479 return txt unless txt.empty? or txt.count(" ") < min_spaces
481 # If we haven't found a first par yet, try to get it from the whole
# Second attempt: same paragraph search as above.
484 while txt.empty? or txt.count(" ") < min_spaces
485 candidate = header_found[PAR_REGEX]
486 break unless candidate
487 txt = candidate.ircify_html
489 txt.sub!(strip, '') if strip
490 debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
493 return txt unless txt.empty? or txt.count(" ") < min_spaces
495 # Nothing yet ... let's get drastic: we look for non-par elements too,
496 # but only for those that match something that we know is likely to
# Third attempt: elements matched by AFTER_PAR1_REGEX instead of PAR_REGEX.
499 while txt.empty? or txt.count(" ") < min_spaces
500 candidate = header_found[AFTER_PAR1_REGEX]
501 break unless candidate
502 txt = candidate.ircify_html
504 txt.sub!(strip, '') if strip
505 debug "(other attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
508 debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
# Unlike the earlier returns, this one tests only the space count, so with a
# min_spaces of 0 an empty txt is returned here.
509 return txt unless txt.count(" ") < min_spaces
514 # Get the first pars of the first _count_ _urls_.
515 # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
516 # and echoed as replies to the IRC message passed as _opts_ :message.
#
# urls::  collection of URLs to try; the loop runs while it is non-empty
# count:: how many first pars are wanted; the loop runs while it is positive
# opts::  options Hash; besides :http_util and :message it is handed on
#         unchanged to Utils.ircify_first_html_par, so that method's options
#         apply as well
#
# NOTE(review): how url, idx and msg are set, and how count and urls are
# updated, is not visible in this excerpt -- presumably msg is
# opts[:message]; confirm.
518 def Utils.get_first_pars(urls, count, opts={})
521 while count > 0 and urls.length > 0
525 # FIXME what happens if some big file is returned? We should share
526 # code with the url plugin to only retrieve partial file content!
527 xml = opts[:http_util].get_cached(url)
529 debug "Unable to retrieve #{url}"
532 par = Utils.ircify_first_html_par(xml, opts)
534 debug "No first par found\n#{xml}"
535 # FIXME only do this if the 'url' plugin is loaded
536 # TODO even better, put the code here
537 # par = @bot.plugins['url'].get_title_from_html(xml)
# The reply is sent only when a message object is available; overlong replies
# are truncated.
540 msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg