lib/rbot/core/utils/utils.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: rbot utilities provider
   5 #
   6 # Author:: Tom Gilbert <tom@linuxbrit.co.uk>
   7 # Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
   8 #
   9 # Copyright:: (C) 2002-2006 Tom Gilbert
  10 # Copyright:: (C) 2007 Giuseppe Bilotta
  11 #
# TODO some of these Utils should be rewritten as extensions to the appropriate
# standard Ruby classes and accordingly be moved to extends.rb
  14
  15 require 'tempfile'
  16 require 'set'
  17
  18 # Try to load htmlentities, fall back to an HTML escape table.
  19 begin
  20   require 'htmlentities'
  21 rescue LoadError
  22   gems = nil
  23   begin
  24     gems = require 'rubygems'
  25   rescue LoadError
  26     gems = false
  27   end
  28   if gems
  29     retry
  30   else
  31     module ::Irc
  32       module Utils
  33         UNESCAPE_TABLE = {
  34     'laquo' => '<<',
  35     'raquo' => '>>',
  36     'quot' => '"',
  37     'apos' => '\'',
  38     'micro' => 'u',
  39     'copy' => '(c)',
  40     'trade' => '(tm)',
  41     'reg' => '(R)',
  42     '#174' => '(R)',
  43     '#8220' => '"',
  44     '#8221' => '"',
  45     '#8212' => '--',
  46     '#39' => '\'',
  47     'amp' => '&',
  48     'lt' => '<',
  49     'gt' => '>',
  50     'hellip' => '...',
  51     'nbsp' => ' ',
  52 =begin
  53     # extras codes, for future use...
  54     'zwnj' => '&#8204;',
  55     'aring' => '\xe5',
  56     'gt' => '>',
  57     'yen' => '\xa5',
  58     'ograve' => '\xf2',
  59     'Chi' => '&#935;',
  60     'bull' => '&#8226;',
  61     'Egrave' => '\xc8',
  62     'Ntilde' => '\xd1',
  63     'upsih' => '&#978;',
  64     'Yacute' => '\xdd',
  65     'asymp' => '&#8776;',
  66     'radic' => '&#8730;',
  67     'otimes' => '&#8855;',
  68     'nabla' => '&#8711;',
  69     'aelig' => '\xe6',
  70     'oelig' => '&#339;',
  71     'equiv' => '&#8801;',
  72     'Psi' => '&#936;',
  73     'auml' => '\xe4',
  74     'circ' => '&#710;',
  75     'Acirc' => '\xc2',
  76     'Epsilon' => '&#917;',
  77     'Yuml' => '&#376;',
  78     'Eta' => '&#919;',
  79     'Icirc' => '\xce',
  80     'Upsilon' => '&#933;',
  81     'ndash' => '&#8211;',
  82     'there4' => '&#8756;',
  83     'Prime' => '&#8243;',
  84     'prime' => '&#8242;',
  85     'psi' => '&#968;',
  86     'Kappa' => '&#922;',
  87     'rsaquo' => '&#8250;',
  88     'Tau' => '&#932;',
  89     'darr' => '&#8595;',
  90     'ocirc' => '\xf4',
  91     'lrm' => '&#8206;',
  92     'zwj' => '&#8205;',
  93     'cedil' => '\xb8',
  94     'Ecirc' => '\xca',
  95     'not' => '\xac',
  96     'AElig' => '\xc6',
  97     'oslash' => '\xf8',
  98     'acute' => '\xb4',
  99     'lceil' => '&#8968;',
 100     'shy' => '\xad',
 101     'rdquo' => '&#8221;',
 102     'ge' => '&#8805;',
 103     'Igrave' => '\xcc',
 104     'Ograve' => '\xd2',
 105     'euro' => '&#8364;',
 106     'dArr' => '&#8659;',
 107     'sdot' => '&#8901;',
 108     'nbsp' => '\xa0',
 109     'lfloor' => '&#8970;',
 110     'lArr' => '&#8656;',
 111     'Auml' => '\xc4',
 112     'larr' => '&#8592;',
 113     'Atilde' => '\xc3',
 114     'Otilde' => '\xd5',
 115     'szlig' => '\xdf',
 116     'clubs' => '&#9827;',
 117     'diams' => '&#9830;',
 118     'agrave' => '\xe0',
 119     'Ocirc' => '\xd4',
 120     'Iota' => '&#921;',
 121     'Theta' => '&#920;',
 122     'Pi' => '&#928;',
 123     'OElig' => '&#338;',
 124     'Scaron' => '&#352;',
 125     'frac14' => '\xbc',
 126     'egrave' => '\xe8',
 127     'sub' => '&#8834;',
 128     'iexcl' => '\xa1',
 129     'frac12' => '\xbd',
 130     'sbquo' => '&#8218;',
 131     'ordf' => '\xaa',
 132     'sum' => '&#8721;',
 133     'prop' => '&#8733;',
 134     'Uuml' => '\xdc',
 135     'ntilde' => '\xf1',
 136     'sup' => '&#8835;',
 137     'theta' => '&#952;',
 138     'prod' => '&#8719;',
 139     'nsub' => '&#8836;',
 140     'hArr' => '&#8660;',
 141     'rlm' => '&#8207;',
 142     'THORN' => '\xde',
 143     'infin' => '&#8734;',
 144     'yuml' => '\xff',
 145     'Mu' => '&#924;',
 146     'le' => '&#8804;',
 147     'Eacute' => '\xc9',
 148     'thinsp' => '&#8201;',
 149     'ecirc' => '\xea',
 150     'bdquo' => '&#8222;',
 151     'Sigma' => '&#931;',
 152     'fnof' => '&#402;',
 153     'Aring' => '\xc5',
 154     'tilde' => '&#732;',
 155     'frac34' => '\xbe',
 156     'emsp' => '&#8195;',
 157     'mdash' => '&#8212;',
 158     'uarr' => '&#8593;',
 159     'permil' => '&#8240;',
 160     'Ugrave' => '\xd9',
 161     'rarr' => '&#8594;',
 162     'Agrave' => '\xc0',
 163     'chi' => '&#967;',
 164     'forall' => '&#8704;',
 165     'eth' => '\xf0',
 166     'rceil' => '&#8969;',
 167     'iuml' => '\xef',
 168     'gamma' => '&#947;',
 169     'lambda' => '&#955;',
 170     'harr' => '&#8596;',
 171     'rang' => '&#9002;',
 172     'xi' => '&#958;',
 173     'dagger' => '&#8224;',
 174     'divide' => '\xf7',
 175     'Ouml' => '\xd6',
 176     'image' => '&#8465;',
 177     'alefsym' => '&#8501;',
 178     'igrave' => '\xec',
 179     'otilde' => '\xf5',
 180     'Oacute' => '\xd3',
 181     'sube' => '&#8838;',
 182     'alpha' => '&#945;',
 183     'frasl' => '&#8260;',
 184     'ETH' => '\xd0',
 185     'lowast' => '&#8727;',
 186     'Nu' => '&#925;',
 187     'plusmn' => '\xb1',
 188     'Euml' => '\xcb',
 189     'real' => '&#8476;',
 190     'sup1' => '\xb9',
 191     'sup2' => '\xb2',
 192     'sup3' => '\xb3',
 193     'Oslash' => '\xd8',
 194     'Aacute' => '\xc1',
 195     'cent' => '\xa2',
 196     'oline' => '&#8254;',
 197     'Beta' => '&#914;',
 198     'perp' => '&#8869;',
 199     'Delta' => '&#916;',
 200     'loz' => '&#9674;',
 201     'pi' => '&#960;',
 202     'iota' => '&#953;',
 203     'empty' => '&#8709;',
 204     'euml' => '\xeb',
 205     'brvbar' => '\xa6',
 206     'iacute' => '\xed',
 207     'para' => '\xb6',
 208     'micro' => '\xb5',
 209     'cup' => '&#8746;',
 210     'weierp' => '&#8472;',
 211     'uuml' => '\xfc',
 212     'part' => '&#8706;',
 213     'icirc' => '\xee',
 214     'delta' => '&#948;',
 215     'omicron' => '&#959;',
 216     'upsilon' => '&#965;',
 217     'Iuml' => '\xcf',
 218     'Lambda' => '&#923;',
 219     'Xi' => '&#926;',
 220     'kappa' => '&#954;',
 221     'ccedil' => '\xe7',
 222     'Ucirc' => '\xdb',
 223     'cap' => '&#8745;',
 224     'mu' => '&#956;',
 225     'scaron' => '&#353;',
 226     'lsquo' => '&#8216;',
 227     'isin' => '&#8712;',
 228     'Zeta' => '&#918;',
 229     'supe' => '&#8839;',
 230     'deg' => '\xb0',
 231     'and' => '&#8743;',
 232     'tau' => '&#964;',
 233     'pound' => '\xa3',
 234     'hellip' => '&#8230;',
 235     'curren' => '\xa4',
 236     'int' => '&#8747;',
 237     'ucirc' => '\xfb',
 238     'rfloor' => '&#8971;',
 239     'ensp' => '&#8194;',
 240     'crarr' => '&#8629;',
 241     'ugrave' => '\xf9',
 242     'notin' => '&#8713;',
 243     'exist' => '&#8707;',
 244     'uArr' => '&#8657;',
 245     'cong' => '&#8773;',
 246     'Dagger' => '&#8225;',
 247     'oplus' => '&#8853;',
 248     'times' => '\xd7',
 249     'atilde' => '\xe3',
 250     'piv' => '&#982;',
 251     'ni' => '&#8715;',
 252     'Phi' => '&#934;',
 253     'lsaquo' => '&#8249;',
 254     'Uacute' => '\xda',
 255     'Omicron' => '&#927;',
 256     'ang' => '&#8736;',
 257     'ne' => '&#8800;',
 258     'iquest' => '\xbf',
 259     'eta' => '&#951;',
 260     'yacute' => '\xfd',
 261     'Rho' => '&#929;',
 262     'uacute' => '\xfa',
 263     'Alpha' => '&#913;',
 264     'zeta' => '&#950;',
 265     'Omega' => '&#937;',
 266     'nu' => '&#957;',
 267     'sim' => '&#8764;',
 268     'sect' => '\xa7',
 269     'phi' => '&#966;',
 270     'sigmaf' => '&#962;',
 271     'macr' => '\xaf',
 272     'minus' => '&#8722;',
 273     'Ccedil' => '\xc7',
 274     'ordm' => '\xba',
 275     'epsilon' => '&#949;',
 276     'beta' => '&#946;',
 277     'rArr' => '&#8658;',
 278     'rho' => '&#961;',
 279     'aacute' => '\xe1',
 280     'eacute' => '\xe9',
 281     'omega' => '&#969;',
 282     'middot' => '\xb7',
 283     'Gamma' => '&#915;',
 284     'Iacute' => '\xcd',
 285     'lang' => '&#9001;',
 286     'spades' => '&#9824;',
 287     'rsquo' => '&#8217;',
 288     'uml' => '\xa8',
 289     'thorn' => '\xfe',
 290     'ouml' => '\xf6',
 291     'thetasym' => '&#977;',
 292     'or' => '&#8744;',
 293     'raquo' => '\xbb',
 294     'acirc' => '\xe2',
 295     'ldquo' => '&#8220;',
 296     'hearts' => '&#9829;',
 297     'sigma' => '&#963;',
 298     'oacute' => '\xf3',
 299 =end
 300         }
 301       end
 302     end
 303   end
 304 end
 305
 306 begin
 307   require 'hpricot'
 308   module ::Irc
 309     module Utils
 310       AFTER_PAR_PATH = /^(?:div|span)$/
 311       AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/
 312       AFTER_PAR_CLASS = /body|message|text/i
 313     end
 314   end
 315 rescue LoadError
 316   gems = nil
 317   begin
 318     gems = require 'rubygems'
 319   rescue LoadError
 320     gems = false
 321   end
 322   if gems
 323     retry
 324   else
 325     module ::Irc
 326       module Utils
 327         # Some regular expressions to manage HTML data
 328
 329         # Title
 330         TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
 331
 332         # H1, H2, etc
 333         HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
 334         # A paragraph
 335         PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 336
 337         # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
 338         # to mark actual text
 339         AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 340
 341         # At worst, we can try stuff which is comprised between two <br>
 342         AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
 343       end
 344     end
 345   end
 346 end
 347
 348 module ::Irc
 349
 350   # Miscellaneous useful functions
 351   module Utils
 352     @@bot = nil unless defined? @@bot
 353     @@safe_save_dir = nil unless defined?(@@safe_save_dir)
 354
 355     # The bot instance
 356     def Utils.bot
 357       @@bot
 358     end
 359
 360     # Set up some Utils routines which depend on the associated bot.
 361     def Utils.bot=(b)
 362       debug "initializing utils"
 363       @@bot = b
 364       @@safe_save_dir = "#{@@bot.botclass}/safe_save"
 365     end
 366
 367
 368     # Seconds per minute
 369     SEC_PER_MIN = 60
 370     # Seconds per hour
 371     SEC_PER_HR = SEC_PER_MIN * 60
 372     # Seconds per day
 373     SEC_PER_DAY = SEC_PER_HR * 24
 374     # Seconds per (30-day) month
 375     SEC_PER_MNTH = SEC_PER_DAY * 30
 376     # Second per (30*12 = 360 day) year
 377     SEC_PER_YR = SEC_PER_MNTH * 12
 378
 379     # Auxiliary method needed by Utils.secs_to_string
 380     def Utils.secs_to_string_case(array, var, string, plural)
 381       case var
 382       when 1
 383         array << "1 #{string}"
 384       else
 385         array << "#{var} #{plural}"
 386       end
 387     end
 388
 389     # Turn a number of seconds into a human readable string, e.g
 390     # 2 days, 3 hours, 18 minutes and 10 seconds
 391     def Utils.secs_to_string(secs)
 392       ret = []
 393       years, secs = secs.divmod SEC_PER_YR
 394       secs_to_string_case(ret, years, _("year"), _("years")) if years > 0
 395       months, secs = secs.divmod SEC_PER_MNTH
 396       secs_to_string_case(ret, months, _("month"), _("months")) if months > 0
 397       days, secs = secs.divmod SEC_PER_DAY
 398       secs_to_string_case(ret, days, _("day"), _("days")) if days > 0
 399       hours, secs = secs.divmod SEC_PER_HR
 400       secs_to_string_case(ret, hours, _("hour"), _("hours")) if hours > 0
 401       mins, secs = secs.divmod SEC_PER_MIN
 402       secs_to_string_case(ret, mins, _("minute"), _("minutes")) if mins > 0
 403       secs = secs.to_i
 404       secs_to_string_case(ret, secs, _("second"), _("seconds")) if secs > 0 or ret.empty?
 405       case ret.length
 406       when 0
 407         raise "Empty ret array!"
 408       when 1
 409         return ret.to_s
 410       else
 411         return [ret[0, ret.length-1].join(", ") , ret[-1]].join(_(" and "))
 412       end
 413     end
 414
 415
 416     # Execute an external program, returning a String obtained by redirecting
 417     # the program's standards errors and output
 418     #
 419     def Utils.safe_exec(command, *args)
 420       IO.popen("-") { |p|
 421         if p
 422           return p.readlines.join("\n")
 423         else
 424           begin
 425             $stderr.reopen($stdout)
 426             exec(command, *args)
 427           rescue Exception => e
 428             puts "exec of #{command} led to exception: #{e.pretty_inspect}"
 429             Kernel::exit! 0
 430           end
 431           puts "exec of #{command} failed"
 432           Kernel::exit! 0
 433         end
 434       }
 435     end
 436
 437
 438     # Safely (atomically) save to _file_, by passing a tempfile to the block
 439     # and then moving the tempfile to its final location when done.
 440     #
 441     # call-seq: Utils.safe_save(file, &block)
 442     #
 443     def Utils.safe_save(file)
 444       raise 'No safe save directory defined!' if @@safe_save_dir.nil?
 445       basename = File.basename(file)
 446       temp = Tempfile.new(basename,@@safe_save_dir)
 447       temp.binmode
 448       yield temp if block_given?
 449       temp.close
 450       File.rename(temp.path, file)
 451     end
 452
 453
 454     # Decode HTML entities in the String _str_, using HTMLEntities if the
 455     # package was found, or UNESCAPE_TABLE otherwise.
 456     #
 457     def Utils.decode_html_entities(str)
 458       if defined? ::HTMLEntities
 459         return HTMLEntities.decode_entities(str)
 460       else
 461         str.gsub(/(&(.+?);)/) {
 462           symbol = $2
 463           # remove the 0-paddng from unicode integers
 464           if symbol =~ /#(.+)/
 465             symbol = "##{$1.to_i.to_s}"
 466           end
 467
 468           # output the symbol's irc-translated character, or a * if it's unknown
 469           UNESCAPE_TABLE[symbol] || (symbol.match(/^\d+$/) ? [$0.to_i].pack("U") : '*')
 470         }
 471       end
 472     end
 473
 474     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
 475     # If possible, grab the one after the first heading
 476     #
 477     # It is possible to pass some options to determine how the stripping
 478     # occurs. Currently supported options are
 479     # strip:: Regex or String to strip at the beginning of the obtained
 480     #         text
 481     # min_spaces:: minimum number of spaces a paragraph should have
 482     #
 483     def Utils.ircify_first_html_par(xml_org, opts={})
 484       if defined? ::Hpricot
 485         Utils.ircify_first_html_par_wh(xml_org, opts)
 486       else
 487         Utils.ircify_first_html_par_woh(xml_org, opts)
 488       end
 489     end
 490
 491     # HTML first par grabber using hpricot
 492     def Utils.ircify_first_html_par_wh(xml_org, opts={})
 493       doc = Hpricot(xml_org)
 494
 495       # Strip styles and scripts
 496       (doc/"style|script").remove
 497
 498       debug doc
 499
 500       strip = opts[:strip]
 501       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
 502
 503       min_spaces = opts[:min_spaces] || 8
 504       min_spaces = 0 if min_spaces < 0
 505
 506       txt = String.new
 507
 508       pre_h = pars = by_span = nil
 509
 510       while true
 511         debug "Minimum number of spaces: #{min_spaces}"
 512
 513         # Initial attempt: <p> that follows <h\d>
 514         if pre_h.nil?
 515           pre_h = Hpricot::Elements[]
 516           found_h = false
 517           doc.search("*") { |e|
 518             next if e.bogusetag?
 519             case e.pathname
 520             when /^h\d/
 521               found_h = true
 522             when 'p'
 523               pre_h << e if found_h
 524             end
 525           }
 526           debug "Hx: found: #{pre_h.pretty_inspect}"
 527         end
 528
 529         pre_h.each { |p|
 530           debug p
 531           txt = p.to_html.ircify_html
 532           txt.sub!(strip, '') if strip
 533           debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 534           break unless txt.empty? or txt.count(" ") < min_spaces
 535         }
 536
 537         return txt unless txt.empty? or txt.count(" ") < min_spaces
 538
 539         # Second natural attempt: just get any <p>
 540         pars = doc/"p" if pars.nil?
 541         debug "par: found: #{pars.pretty_inspect}"
 542         pars.each { |p|
 543           debug p
 544           txt = p.to_html.ircify_html
 545           txt.sub!(strip, '') if strip
 546           debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 547           break unless txt.empty? or txt.count(" ") < min_spaces
 548         }
 549
 550         return txt unless txt.empty? or txt.count(" ") < min_spaces
 551
 552         # Nothing yet ... let's get drastic: we look for non-par elements too,
 553         # but only for those that match something that we know is likely to
 554         # contain text
 555
 556         # Some blogging and forum platforms use spans or divs with a 'body' or
 557         # 'message' or 'text' in their class to mark actual text. Since we want
 558         # the class match to be partial and case insensitive, we collect
 559         # the common elements that may have this class and then filter out those
 560         # we don't need. If no divs or spans are found, we'll accept additional
 561         # elements too (td, tr, tbody, table).
 562         if by_span.nil?
 563           by_span = Hpricot::Elements[]
 564           extra = Hpricot::Elements[]
 565           doc.search("*") { |el|
 566             next if el.bogusetag?
 567             case el.pathname
 568             when AFTER_PAR_PATH
 569               by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
 570             when AFTER_PAR_EX
 571               extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
 572             end
 573           }
 574           if by_span.empty? and not extra.empty?
 575             by_span.concat extra
 576           end
 577           debug "other \#1: found: #{by_span.pretty_inspect}"
 578         end
 579
 580         by_span.each { |p|
 581           debug p
 582           txt = p.to_html.ircify_html
 583           txt.sub!(strip, '') if strip
 584           debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
 585           break unless txt.empty? or txt.count(" ") < min_spaces
 586         }
 587
 588         return txt unless txt.empty? or txt.count(" ") < min_spaces
 589
 590         # At worst, we can try stuff which is comprised between two <br>
 591         # TODO
 592
 593         debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
 594         return txt unless txt.count(" ") < min_spaces
 595         break if min_spaces == 0
 596         min_spaces /= 2
 597       end
 598     end
 599
 600     # HTML first par grabber without hpricot
 601     def Utils.ircify_first_html_par_woh(xml_org, opts={})
 602       xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
 603
 604       strip = opts[:strip]
 605       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
 606
 607       min_spaces = opts[:min_spaces] || 8
 608       min_spaces = 0 if min_spaces < 0
 609
 610       txt = String.new
 611
 612       while true
 613         debug "Minimum number of spaces: #{min_spaces}"
 614         header_found = xml.match(HX_REGEX)
 615         if header_found
 616           header_found = $'
 617           while txt.empty? or txt.count(" ") < min_spaces
 618             candidate = header_found[PAR_REGEX]
 619             break unless candidate
 620             txt = candidate.ircify_html
 621             header_found = $'
 622             txt.sub!(strip, '') if strip
 623             debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 624           end
 625         end
 626
 627         return txt unless txt.empty? or txt.count(" ") < min_spaces
 628
 629         # If we haven't found a first par yet, try to get it from the whole
 630         # document
 631         header_found = xml
 632         while txt.empty? or txt.count(" ") < min_spaces
 633           candidate = header_found[PAR_REGEX]
 634           break unless candidate
 635           txt = candidate.ircify_html
 636           header_found = $'
 637           txt.sub!(strip, '') if strip
 638           debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
 639         end
 640
 641         return txt unless txt.empty? or txt.count(" ") < min_spaces
 642
 643         # Nothing yet ... let's get drastic: we look for non-par elements too,
 644         # but only for those that match something that we know is likely to
 645         # contain text
 646
 647         # Attempt #1
 648         header_found = xml
 649         while txt.empty? or txt.count(" ") < min_spaces
 650           candidate = header_found[AFTER_PAR1_REGEX]
 651           break unless candidate
 652           txt = candidate.ircify_html
 653           header_found = $'
 654           txt.sub!(strip, '') if strip
 655           debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
 656         end
 657
 658         return txt unless txt.empty? or txt.count(" ") < min_spaces
 659
 660         # Attempt #2
 661         header_found = xml
 662         while txt.empty? or txt.count(" ") < min_spaces
 663           candidate = header_found[AFTER_PAR2_REGEX]
 664           break unless candidate
 665           txt = candidate.ircify_html
 666           header_found = $'
 667           txt.sub!(strip, '') if strip
 668           debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
 669         end
 670
 671         debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
 672         return txt unless txt.count(" ") < min_spaces
 673         break if min_spaces == 0
 674         min_spaces /= 2
 675       end
 676     end
 677
 678     # This method extracts title, content (first par) and extra
 679     # information from the given document _doc_.
 680     #
 681     # _doc_ can be an URI, a Net::HTTPResponse or a String.
 682     #
 683     # If _doc_ is a String, only title and content information
 684     # are retrieved (if possible), using standard methods.
 685     #
 686     # If _doc_ is an URI or a Net::HTTPResponse, additional
 687     # information is retrieved, and special title/summary
 688     # extraction routines are used if possible.
 689     #
 690     def Utils.get_html_info(doc, opts={})
 691       case doc
 692       when String
 693         Utils.get_string_html_info(doc, opts)
 694       when Net::HTTPResponse
 695         Utils.get_resp_html_info(doc, opts)
 696       when URI
 697         if doc.fragment and not doc.fragment.empty?
 698           opts[:uri_fragment] ||= doc.fragment
 699         end
 700         ret = Hash.new
 701         @@bot.httputil.get_response(doc) { |resp|
 702           ret = Utils.get_resp_html_info(resp, opts)
 703         }
 704         return ret
 705       else
 706         raise
 707       end
 708     end
 709
 710     class ::UrlLinkError < RuntimeError
 711     end
 712
 713     # This method extracts title, content (first par) and extra
 714     # information from the given Net::HTTPResponse _resp_.
 715     #
 716     # Currently, the only accepted option (in _opts_) is
 717     # uri_fragment:: the URI fragment of the original request
 718     #
 719     # Returns a Hash with the following keys:
 720     # title:: the title of the document (if any)
 721     # content:: the first paragraph of the document (if any)
 722     # headers::
 723     #   the headers of the Net::HTTPResponse. The value is
 724     #   a Hash whose keys are lowercase forms of the HTTP
 725     #   header fields, and whose values are Arrays.
 726     #
 727     def Utils.get_resp_html_info(resp, opts={})
 728       ret = Hash.new
 729       case resp
 730       when Net::HTTPSuccess
 731         ret[:headers] = resp.to_hash
 732
 733         if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
 734           partial = resp.partial_body(@@bot.config['http.info_bytes'])
 735           ret.merge!(Utils.get_string_html_info(partial, opts))
 736         end
 737         return ret
 738       else
 739         raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
 740       end
 741     end
 742
 743     # This method extracts title and content (first par)
 744     # from the given HTML or XML document _text_, using
 745     # standard methods (String#ircify_html_title,
 746     # Utils.ircify_first_html_par)
 747     #
 748     # Currently, the only accepted option (in _opts_) is
 749     # uri_fragment:: the URI fragment of the original request
 750     #
 751     def Utils.get_string_html_info(text, opts={})
 752       txt = text.dup
 753       title = txt.ircify_html_title
 754       if frag = opts[:uri_fragment] and not frag.empty?
 755         fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
 756         txt.sub!(fragreg,'')
 757       end
 758       c_opts = opts.dup
 759       c_opts[:strip] ||= title
 760       content = Utils.ircify_first_html_par(txt, c_opts)
 761       content = nil if content.empty?
 762       return {:title => title, :content => content}
 763     end
 764
 765     # Get the first pars of the first _count_ _urls_.
 766     # The pages are downloaded using the bot httputil service.
 767     # Returns an array of the first paragraphs fetched.
 768     # If (optional) _opts_ :message is specified, those paragraphs are
 769     # echoed as replies to the IRC message passed as _opts_ :message
 770     #
 771     def Utils.get_first_pars(urls, count, opts={})
 772       idx = 0
 773       msg = opts[:message]
 774       retval = Array.new
 775       while count > 0 and urls.length > 0
 776         url = urls.shift
 777         idx += 1
 778
 779         begin
 780           info = Utils.get_html_info(URI.parse(url), opts)
 781
 782           par = info[:content]
 783           retval.push(par)
 784
 785           if par
 786             msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
 787             count -=1
 788           end
 789         rescue
 790           debug "Unable to retrieve #{url}: #{$!}"
 791           next
 792         end
 793       end
 794       return retval
 795     end
 796
 797   end
 798 end
 799
# Bind the utilities to the plugin manager's bot as soon as this file is
# loaded; the setter also derives the safe-save directory from the bot.
Irc::Utils.bot = Irc::Bot::Plugins.manager.bot