4 # :title: rbot utilities provider
6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
9 # Copyright:: (C) 2002-2006 Tom Gilbert
10 # Copyright:: (C) 2007 Giuseppe Bilotta
12 # TODO some of these Utils should be rewritten as extensions to the appropriate
13 # standard Ruby classes and accordingly be moved to extends.rb
20 require 'htmlentities'
21 $we_have_html_entities_decoder = true
23 gems = require 'rubygems' rescue false
27 $we_have_html_entities_decoder = false
50 # extras codes, for future use...
64 'otimes' => '⊗',
73 'Epsilon' => 'Ε',
77 'Upsilon' => 'Υ',
79 'there4' => '∴',
84 'rsaquo' => '›',
106 'lfloor' => '⌊',
113 'clubs' => '♣',
114 'diams' => '♦',
121 'Scaron' => 'Š',
127 'sbquo' => '‚',
140 'infin' => '∞',
145 'thinsp' => ' ',
147 'bdquo' => '„',
154 'mdash' => '—',
156 'permil' => '‰',
161 'forall' => '∀',
163 'rceil' => '⌉',
166 'lambda' => 'λ',
170 'dagger' => '†',
173 'image' => 'ℑ',
174 'alefsym' => 'ℵ',
180 'frasl' => '⁄',
182 'lowast' => '∗',
193 'oline' => '‾',
200 'empty' => '∅',
207 'weierp' => '℘',
212 'omicron' => 'ο',
213 'upsilon' => 'υ',
215 'Lambda' => 'Λ',
222 'scaron' => 'š',
223 'lsquo' => '‘',
231 'hellip' => '…',
235 'rfloor' => '⌋',
237 'crarr' => '↵',
239 'notin' => '∉',
240 'exist' => '∃',
243 'Dagger' => '‡',
244 'oplus' => '⊕',
250 'lsaquo' => '‹',
252 'Omicron' => 'Ο',
267 'sigmaf' => 'ς',
269 'minus' => '−',
272 'epsilon' => 'ε',
283 'spades' => '♠',
284 'rsquo' => '’',
288 'thetasym' => 'ϑ',
292 'ldquo' => '“',
293 'hearts' => '♥',
306 # miscellaneous useful functions
309 SEC_PER_HR = SEC_PER_MIN * 60
310 SEC_PER_DAY = SEC_PER_HR * 24
311 SEC_PER_MNTH = SEC_PER_DAY * 30
312 SEC_PER_YR = SEC_PER_MNTH * 12
314 def Utils.secs_to_string_case(array, var, string, plural)
317 array << "1 #{string}"
319 array << "#{var} #{plural}"
323 # turn a number of seconds into a human readable string, e.g
324 # 2 days, 3 hours, 18 minutes, 10 seconds
325 def Utils.secs_to_string(secs)
327 years, secs = secs.divmod SEC_PER_YR
328 secs_to_string_case(ret, years, "year", "years") if years > 0
329 months, secs = secs.divmod SEC_PER_MNTH
330 secs_to_string_case(ret, months, "month", "months") if months > 0
331 days, secs = secs.divmod SEC_PER_DAY
332 secs_to_string_case(ret, days, "day", "days") if days > 0
333 hours, secs = secs.divmod SEC_PER_HR
334 secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
335 mins, secs = secs.divmod SEC_PER_MIN
336 secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
338 secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
341 raise "Empty ret array!"
345 return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
350 def Utils.safe_exec(command, *args)
353 return p.readlines.join("\n")
358 rescue Exception => e
359 puts "exec of #{command} led to exception: #{e.inspect}"
362 puts "exec of #{command} failed"
369 @@safe_save_dir = nil unless defined?(@@safe_save_dir)
370 def Utils.set_safe_save_dir(str)
371 @@safe_save_dir = str.dup
374 def Utils.safe_save(file)
375 raise 'No safe save directory defined!' if @@safe_save_dir.nil?
376 basename = File.basename(file)
377 temp = Tempfile.new(basename,@@safe_save_dir)
379 yield temp if block_given?
381 File.rename(temp.path, file)
385 # returns a string containing the result of an HTTP GET on the uri
386 def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
388 # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
389 Net::HTTP.version_1_2
390 # (so we support the 1_1 api anyway, avoids problems)
392 uri = URI.parse uristr
395 query += "?#{uri.query}"
400 if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
401 proxy_host = proxy_uri.host
402 proxy_port = proxy_uri.port
406 http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
407 http.open_timeout = opentimeout
408 http.read_timeout = readtimeout
411 resp = http.get(query)
412 if resp.code == "200"
418 error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
423 def Utils.decode_html_entities(str)
424 if $we_have_html_entities_decoder
425 return HTMLEntities.decode_entities(str)
427 str.gsub(/(&(.+?);)/) {
429 # remove the 0-paddng from unicode integers
431 symbol = "##{$1.to_i.to_s}"
434 # output the symbol's irc-translated character, or a * if it's unknown
435 UNESCAPE_TABLE[symbol] || [symbol[/\d+/].to_i].pack("U") rescue '*'
440 HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
441 PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
443 # Some blogging and forum platforms use spans or divs with a 'body' in their class
444 # to mark actual text
445 AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
447 # At worst, we can try stuff which is comprised between two <br>
448 AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
450 # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
451 # If possible, grab the one after the first heading
453 # It is possible to pass some options to determine how the stripping
454 # occurs. Currently supported options are
455 # * :strip => Regex or String to strip at the beginning of the obtained
457 # * :min_spaces => Minimum number of spaces a paragraph should have
459 def Utils.ircify_first_html_par(xml, opts={})
463 strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
465 min_spaces = opts[:min_spaces] || 8
466 min_spaces = 0 if min_spaces < 0
469 debug "Minimum number of spaces: #{min_spaces}"
470 header_found = xml.match(HX_REGEX)
473 while txt.empty? or txt.count(" ") < min_spaces
474 candidate = header_found[PAR_REGEX]
475 break unless candidate
476 txt = candidate.ircify_html
478 txt.sub!(strip, '') if strip
479 debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
483 return txt unless txt.empty? or txt.count(" ") < min_spaces
485 # If we haven't found a first par yet, try to get it from the whole
488 while txt.empty? or txt.count(" ") < min_spaces
489 candidate = header_found[PAR_REGEX]
490 break unless candidate
491 txt = candidate.ircify_html
493 txt.sub!(strip, '') if strip
494 debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
497 return txt unless txt.empty? or txt.count(" ") < min_spaces
499 # Nothing yet ... let's get drastic: we look for non-par elements too,
500 # but only for those that match something that we know is likely to
505 while txt.empty? or txt.count(" ") < min_spaces
506 candidate = header_found[AFTER_PAR1_REGEX]
507 break unless candidate
508 txt = candidate.ircify_html
510 txt.sub!(strip, '') if strip
511 debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
516 while txt.empty? or txt.count(" ") < min_spaces
517 candidate = header_found[AFTER_PAR2_REGEX]
518 break unless candidate
519 txt = candidate.ircify_html
521 txt.sub!(strip, '') if strip
522 debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
525 debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
526 return txt unless txt.count(" ") < min_spaces
531 # Get the first pars of the first _count_ _urls_.
532 # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
533 # and echoed as replies to the IRC message passed as _opts_ :message.
535 def Utils.get_first_pars(urls, count, opts={})
538 while count > 0 and urls.length > 0
542 # FIXME what happens if some big file is returned? We should share
543 # code with the url plugin to only retrieve partial file content!
544 xml = opts[:http_util].get_cached(url)
546 debug "Unable to retrieve #{url}"
549 par = Utils.ircify_first_html_par(xml, opts)
551 debug "No first par found\n#{xml}"
552 # FIXME only do this if the 'url' plugin is loaded
553 # TODO even better, put the code here
554 # par = @bot.plugins['url'].get_title_from_html(xml)
557 msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg