lib/rbot/core/utils/utils.rb

   1 require 'net/http'
   2 require 'uri'
   3 require 'tempfile'
   4
   5 begin
   6   $we_have_html_entities_decoder = require 'htmlentities'
   7 rescue LoadError
   8   $we_have_html_entities_decoder = false
   9   module ::Irc
  10     module Utils
  11       UNESCAPE_TABLE = {
  12     'laquo' => '<<',
  13     'raquo' => '>>',
  14     'quot' => '"',
  15     'apos' => '\'',
  16     'micro' => 'u',
  17     'copy' => '(c)',
  18     'trade' => '(tm)',
  19     'reg' => '(R)',
  20     '#174' => '(R)',
  21     '#8220' => '"',
  22     '#8221' => '"',
  23     '#8212' => '--',
  24     '#39' => '\'',
  25     'amp' => '&',
  26     'lt' => '<',
  27     'gt' => '>',
  28     'hellip' => '...',
  29     'nbsp' => ' ',
  30 =begin
  31     # extras codes, for future use...
  32     'zwnj' => '&#8204;',
  33     'aring' => '\xe5',
  34     'gt' => '>',
  35     'yen' => '\xa5',
  36     'ograve' => '\xf2',
  37     'Chi' => '&#935;',
  38     'bull' => '&#8226;',
  39     'Egrave' => '\xc8',
  40     'Ntilde' => '\xd1',
  41     'upsih' => '&#978;',
  42     'Yacute' => '\xdd',
  43     'asymp' => '&#8776;',
  44     'radic' => '&#8730;',
  45     'otimes' => '&#8855;',
  46     'nabla' => '&#8711;',
  47     'aelig' => '\xe6',
  48     'oelig' => '&#339;',
  49     'equiv' => '&#8801;',
  50     'Psi' => '&#936;',
  51     'auml' => '\xe4',
  52     'circ' => '&#710;',
  53     'Acirc' => '\xc2',
  54     'Epsilon' => '&#917;',
  55     'Yuml' => '&#376;',
  56     'Eta' => '&#919;',
  57     'Icirc' => '\xce',
  58     'Upsilon' => '&#933;',
  59     'ndash' => '&#8211;',
  60     'there4' => '&#8756;',
  61     'Prime' => '&#8243;',
  62     'prime' => '&#8242;',
  63     'psi' => '&#968;',
  64     'Kappa' => '&#922;',
  65     'rsaquo' => '&#8250;',
  66     'Tau' => '&#932;',
  67     'darr' => '&#8595;',
  68     'ocirc' => '\xf4',
  69     'lrm' => '&#8206;',
  70     'zwj' => '&#8205;',
  71     'cedil' => '\xb8',
  72     'Ecirc' => '\xca',
  73     'not' => '\xac',
  74     'AElig' => '\xc6',
  75     'oslash' => '\xf8',
  76     'acute' => '\xb4',
  77     'lceil' => '&#8968;',
  78     'shy' => '\xad',
  79     'rdquo' => '&#8221;',
  80     'ge' => '&#8805;',
  81     'Igrave' => '\xcc',
  82     'Ograve' => '\xd2',
  83     'euro' => '&#8364;',
  84     'dArr' => '&#8659;',
  85     'sdot' => '&#8901;',
  86     'nbsp' => '\xa0',
  87     'lfloor' => '&#8970;',
  88     'lArr' => '&#8656;',
  89     'Auml' => '\xc4',
  90     'larr' => '&#8592;',
  91     'Atilde' => '\xc3',
  92     'Otilde' => '\xd5',
  93     'szlig' => '\xdf',
  94     'clubs' => '&#9827;',
  95     'diams' => '&#9830;',
  96     'agrave' => '\xe0',
  97     'Ocirc' => '\xd4',
  98     'Iota' => '&#921;',
  99     'Theta' => '&#920;',
 100     'Pi' => '&#928;',
 101     'OElig' => '&#338;',
 102     'Scaron' => '&#352;',
 103     'frac14' => '\xbc',
 104     'egrave' => '\xe8',
 105     'sub' => '&#8834;',
 106     'iexcl' => '\xa1',
 107     'frac12' => '\xbd',
 108     'sbquo' => '&#8218;',
 109     'ordf' => '\xaa',
 110     'sum' => '&#8721;',
 111     'prop' => '&#8733;',
 112     'Uuml' => '\xdc',
 113     'ntilde' => '\xf1',
 114     'sup' => '&#8835;',
 115     'theta' => '&#952;',
 116     'prod' => '&#8719;',
 117     'nsub' => '&#8836;',
 118     'hArr' => '&#8660;',
 119     'rlm' => '&#8207;',
 120     'THORN' => '\xde',
 121     'infin' => '&#8734;',
 122     'yuml' => '\xff',
 123     'Mu' => '&#924;',
 124     'le' => '&#8804;',
 125     'Eacute' => '\xc9',
 126     'thinsp' => '&#8201;',
 127     'ecirc' => '\xea',
 128     'bdquo' => '&#8222;',
 129     'Sigma' => '&#931;',
 130     'fnof' => '&#402;',
 131     'Aring' => '\xc5',
 132     'tilde' => '&#732;',
 133     'frac34' => '\xbe',
 134     'emsp' => '&#8195;',
 135     'mdash' => '&#8212;',
 136     'uarr' => '&#8593;',
 137     'permil' => '&#8240;',
 138     'Ugrave' => '\xd9',
 139     'rarr' => '&#8594;',
 140     'Agrave' => '\xc0',
 141     'chi' => '&#967;',
 142     'forall' => '&#8704;',
 143     'eth' => '\xf0',
 144     'rceil' => '&#8969;',
 145     'iuml' => '\xef',
 146     'gamma' => '&#947;',
 147     'lambda' => '&#955;',
 148     'harr' => '&#8596;',
 149     'rang' => '&#9002;',
 150     'xi' => '&#958;',
 151     'dagger' => '&#8224;',
 152     'divide' => '\xf7',
 153     'Ouml' => '\xd6',
 154     'image' => '&#8465;',
 155     'alefsym' => '&#8501;',
 156     'igrave' => '\xec',
 157     'otilde' => '\xf5',
 158     'Oacute' => '\xd3',
 159     'sube' => '&#8838;',
 160     'alpha' => '&#945;',
 161     'frasl' => '&#8260;',
 162     'ETH' => '\xd0',
 163     'lowast' => '&#8727;',
 164     'Nu' => '&#925;',
 165     'plusmn' => '\xb1',
 166     'Euml' => '\xcb',
 167     'real' => '&#8476;',
 168     'sup1' => '\xb9',
 169     'sup2' => '\xb2',
 170     'sup3' => '\xb3',
 171     'Oslash' => '\xd8',
 172     'Aacute' => '\xc1',
 173     'cent' => '\xa2',
 174     'oline' => '&#8254;',
 175     'Beta' => '&#914;',
 176     'perp' => '&#8869;',
 177     'Delta' => '&#916;',
 178     'loz' => '&#9674;',
 179     'pi' => '&#960;',
 180     'iota' => '&#953;',
 181     'empty' => '&#8709;',
 182     'euml' => '\xeb',
 183     'brvbar' => '\xa6',
 184     'iacute' => '\xed',
 185     'para' => '\xb6',
 186     'micro' => '\xb5',
 187     'cup' => '&#8746;',
 188     'weierp' => '&#8472;',
 189     'uuml' => '\xfc',
 190     'part' => '&#8706;',
 191     'icirc' => '\xee',
 192     'delta' => '&#948;',
 193     'omicron' => '&#959;',
 194     'upsilon' => '&#965;',
 195     'Iuml' => '\xcf',
 196     'Lambda' => '&#923;',
 197     'Xi' => '&#926;',
 198     'kappa' => '&#954;',
 199     'ccedil' => '\xe7',
 200     'Ucirc' => '\xdb',
 201     'cap' => '&#8745;',
 202     'mu' => '&#956;',
 203     'scaron' => '&#353;',
 204     'lsquo' => '&#8216;',
 205     'isin' => '&#8712;',
 206     'Zeta' => '&#918;',
 207     'supe' => '&#8839;',
 208     'deg' => '\xb0',
 209     'and' => '&#8743;',
 210     'tau' => '&#964;',
 211     'pound' => '\xa3',
 212     'hellip' => '&#8230;',
 213     'curren' => '\xa4',
 214     'int' => '&#8747;',
 215     'ucirc' => '\xfb',
 216     'rfloor' => '&#8971;',
 217     'ensp' => '&#8194;',
 218     'crarr' => '&#8629;',
 219     'ugrave' => '\xf9',
 220     'notin' => '&#8713;',
 221     'exist' => '&#8707;',
 222     'uArr' => '&#8657;',
 223     'cong' => '&#8773;',
 224     'Dagger' => '&#8225;',
 225     'oplus' => '&#8853;',
 226     'times' => '\xd7',
 227     'atilde' => '\xe3',
 228     'piv' => '&#982;',
 229     'ni' => '&#8715;',
 230     'Phi' => '&#934;',
 231     'lsaquo' => '&#8249;',
 232     'Uacute' => '\xda',
 233     'Omicron' => '&#927;',
 234     'ang' => '&#8736;',
 235     'ne' => '&#8800;',
 236     'iquest' => '\xbf',
 237     'eta' => '&#951;',
 238     'yacute' => '\xfd',
 239     'Rho' => '&#929;',
 240     'uacute' => '\xfa',
 241     'Alpha' => '&#913;',
 242     'zeta' => '&#950;',
 243     'Omega' => '&#937;',
 244     'nu' => '&#957;',
 245     'sim' => '&#8764;',
 246     'sect' => '\xa7',
 247     'phi' => '&#966;',
 248     'sigmaf' => '&#962;',
 249     'macr' => '\xaf',
 250     'minus' => '&#8722;',
 251     'Ccedil' => '\xc7',
 252     'ordm' => '\xba',
 253     'epsilon' => '&#949;',
 254     'beta' => '&#946;',
 255     'rArr' => '&#8658;',
 256     'rho' => '&#961;',
 257     'aacute' => '\xe1',
 258     'eacute' => '\xe9',
 259     'omega' => '&#969;',
 260     'middot' => '\xb7',
 261     'Gamma' => '&#915;',
 262     'Iacute' => '\xcd',
 263     'lang' => '&#9001;',
 264     'spades' => '&#9824;',
 265     'rsquo' => '&#8217;',
 266     'uml' => '\xa8',
 267     'thorn' => '\xfe',
 268     'ouml' => '\xf6',
 269     'thetasym' => '&#977;',
 270     'or' => '&#8744;',
 271     'raquo' => '\xbb',
 272     'acirc' => '\xe2',
 273     'ldquo' => '&#8220;',
 274     'hearts' => '&#9829;',
 275     'sigma' => '&#963;',
 276     'oacute' => '\xf3',
 277 =end
 278       }
 279     end
 280   end
 281 end
 282
 283
 284 module ::Irc
 285
 286   # miscellaneous useful functions
 287   module Utils
 288     SEC_PER_MIN = 60
 289     SEC_PER_HR = SEC_PER_MIN * 60
 290     SEC_PER_DAY = SEC_PER_HR * 24
 291     SEC_PER_MNTH = SEC_PER_DAY * 30
 292     SEC_PER_YR = SEC_PER_MNTH * 12
 293
 294     def Utils.secs_to_string_case(array, var, string, plural)
 295       case var
 296       when 1
 297         array << "1 #{string}"
 298       else
 299         array << "#{var} #{plural}"
 300       end
 301     end
 302
 303     # turn a number of seconds into a human readable string, e.g
 304     # 2 days, 3 hours, 18 minutes, 10 seconds
 305     def Utils.secs_to_string(secs)
 306       ret = []
 307       years, secs = secs.divmod SEC_PER_YR
 308       secs_to_string_case(ret, years, "year", "years") if years > 0
 309       months, secs = secs.divmod SEC_PER_MNTH
 310       secs_to_string_case(ret, months, "month", "months") if months > 0
 311       days, secs = secs.divmod SEC_PER_DAY
 312       secs_to_string_case(ret, days, "day", "days") if days > 0
 313       hours, secs = secs.divmod SEC_PER_HR
 314       secs_to_string_case(ret, hours, "hour", "hours") if hours > 0
 315       mins, secs = secs.divmod SEC_PER_MIN
 316       secs_to_string_case(ret, mins, "minute", "minutes") if mins > 0
 317       secs_to_string_case(ret, secs, "second", "seconds") if secs > 0 or ret.empty?
 318       case ret.length
 319       when 0
 320         raise "Empty ret array!"
 321       when 1
 322         return ret.to_s
 323       else
 324         return [ret[0, ret.length-1].join(", ") , ret[-1]].join(" and ")
 325       end
 326     end
 327
 328
 329     def Utils.safe_exec(command, *args)
 330       IO.popen("-") {|p|
 331         if(p)
 332           return p.readlines.join("\n")
 333         else
 334           begin
 335             $stderr = $stdout
 336             exec(command, *args)
 337           rescue Exception => e
 338             puts "exec of #{command} led to exception: #{e.inspect}"
 339             Kernel::exit! 0
 340           end
 341           puts "exec of #{command} failed"
 342           Kernel::exit! 0
 343         end
 344       }
 345     end
 346
 347
 348     @@safe_save_dir = nil
 349     def Utils.set_safe_save_dir(str)
 350       @@safe_save_dir = str.dup
 351     end
 352
 353     def Utils.safe_save(file)
 354       raise 'No safe save directory defined!' if @@safe_save_dir.nil?
 355       basename = File.basename(file)
 356       temp = Tempfile.new(basename,@@safe_save_dir)
 357       temp.binmode
 358       yield temp if block_given?
 359       temp.close
 360       File.rename(temp.path, file)
 361     end
 362
 363
 364     # returns a string containing the result of an HTTP GET on the uri
 365     def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
 366
 367       # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
 368       Net::HTTP.version_1_2
 369       # (so we support the 1_1 api anyway, avoids problems)
 370
 371       uri = URI.parse uristr
 372       query = uri.path
 373       if uri.query
 374         query += "?#{uri.query}"
 375       end
 376
 377       proxy_host = nil
 378       proxy_port = nil
 379       if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
 380         proxy_host = proxy_uri.host
 381         proxy_port = proxy_uri.port
 382       end
 383
 384       begin
 385         http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
 386         http.open_timeout = opentimeout
 387         http.read_timeout = readtimeout
 388
 389         http.start {|http|
 390           resp = http.get(query)
 391           if resp.code == "200"
 392             return resp.body
 393           end
 394         }
 395       rescue => e
 396         # cheesy for now
 397         error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
 398         return nil
 399       end
 400     end
 401
 402     def Utils.decode_html_entities(str)
 403       if $we_have_html_entities_decoder
 404         return HTMLEntities.decode_entities(str)
 405       else
 406         str.gsub(/(&(.+?);)/) {
 407           symbol = $2
 408           # remove the 0-paddng from unicode integers
 409           if symbol =~ /#(.+)/
 410             symbol = "##{$1.to_i.to_s}"
 411           end
 412
 413           # output the symbol's irc-translated character, or a * if it's unknown
 414           UNESCAPE_TABLE[symbol] || '*'
 415         }
 416       end
 417     end
 418
 419     H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
 420     PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
 421     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
 422     # If possible, grab the one after the first h1 heading
 423     def Utils.ircify_first_html_par(xml)
 424       header_found = xml.match(H1_REGEX)
 425       txt = String.new
 426       if header_found
 427         debug "Found header: #{header_found[1].inspect}"
 428         while txt.empty?
 429           header_found = $'
 430           candidate = header_found[PAR_REGEX]
 431           break unless candidate
 432           txt = candidate.ircify_html
 433         end
 434       end
 435       # If we haven't found a first par yet, try to get it from the whole
 436       # document
 437       if txt.empty?
 438         header_found = xml
 439         while txt.empty?
 440           candidate = header_found[PAR_REGEX]
 441           break unless candidate
 442           txt = candidate.ircify_html
 443           header_found = $'
 444         end
 445       end
 446       return txt
 447     end
 448
 449     # Get the first pars of the first _count_ _urls_.
 450     # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
 451     # and echoed as replies to the IRC message passed as _opts_ :message.
 452     #
 453     def Utils.get_first_pars(urls, count, opts={})
 454       idx = 0
 455       msg = opts[:message]
 456       while count > 0 and urls.length > 0
 457         url = urls.shift
 458         idx += 1
 459
 460         # FIXME what happens if some big file is returned? We should share
 461         # code with the url plugin to only retrieve partial file content!
 462         xml = opts[:http_util].get_cached(url)
 463         if xml.nil?
 464           debug "Unable to retrieve #{url}"
 465           next
 466         end
 467         debug "Retrieved #{url}"
 468         debug "\t#{xml}"
 469         par = Utils.ircify_first_html_par(xml)
 470         if par.empty?
 471           debug "No first par found\n#{xml}"
 472           # FIXME only do this if the 'url' plugin is loaded
 473           # TODO even better, put the code here
 474           # par = @bot.plugins['url'].get_title_from_html(xml)
 475           next if par.empty?
 476         end
 477         msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
 478         count -=1
 479       end
 480     end
 481
 482
 483   end
 484 end