# TODO some of these Utils should be rewritten as extensions to the appropriate
# standard Ruby classes and accordingly be moved to extends.rb
-require 'net/http'
-require 'uri'
require 'tempfile'
+require 'set'
begin
require 'htmlentities'
$we_have_html_entities_decoder = true
rescue LoadError
- if require 'rubygems' rescue false
+ gems = nil
+ begin
+ gems = require 'rubygems'
+ rescue LoadError
+ gems = false
+ end
+ if gems
retry
else
$we_have_html_entities_decoder = false
# miscellaneous useful functions
module Utils
+ @@bot = nil unless defined? @@bot
+ @@safe_save_dir = nil unless defined?(@@safe_save_dir)
+
+ def Utils.bot
+ @@bot
+ end
+
+ def Utils.bot=(b)
+ debug "initializing utils"
+ @@bot = b
+ @@safe_save_dir = "#{@@bot.botclass}/safe_save"
+ end
+
+
SEC_PER_MIN = 60
SEC_PER_HR = SEC_PER_MIN * 60
SEC_PER_DAY = SEC_PER_HR * 24
return p.readlines.join("\n")
else
begin
- $stderr = $stdout
+ $stderr.reopen($stdout)
exec(command, *args)
rescue Exception => e
- puts "exec of #{command} led to exception: #{e.inspect}"
+ puts "exec of #{command} led to exception: #{e.pretty_inspect}"
Kernel::exit! 0
end
puts "exec of #{command} failed"
end
- @@safe_save_dir = nil unless defined?(@@safe_save_dir)
- def Utils.set_safe_save_dir(str)
- @@safe_save_dir = str.dup
- end
-
def Utils.safe_save(file)
raise 'No safe save directory defined!' if @@safe_save_dir.nil?
basename = File.basename(file)
end
- # returns a string containing the result of an HTTP GET on the uri
- def Utils.http_get(uristr, readtimeout=8, opentimeout=4)
-
- # ruby 1.7 or better needed for this (or 1.6 and debian unstable)
- Net::HTTP.version_1_2
- # (so we support the 1_1 api anyway, avoids problems)
-
- uri = URI.parse uristr
- query = uri.path
- if uri.query
- query += "?#{uri.query}"
- end
-
- proxy_host = nil
- proxy_port = nil
- if(ENV['http_proxy'] && proxy_uri = URI.parse(ENV['http_proxy']))
- proxy_host = proxy_uri.host
- proxy_port = proxy_uri.port
- end
-
- begin
- http = Net::HTTP.new(uri.host, uri.port, proxy_host, proxy_port)
- http.open_timeout = opentimeout
- http.read_timeout = readtimeout
-
- http.start {|http|
- resp = http.get(query)
- if resp.code == "200"
- return resp.body
- end
- }
- rescue => e
- # cheesy for now
- error "Utils.http_get exception: #{e.inspect}, while trying to get #{uristr}"
- return nil
- end
- end
-
def Utils.decode_html_entities(str)
if $we_have_html_entities_decoder
return HTMLEntities.decode_entities(str)
HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
- # Some blogging and forum platforms use spans or divs with a 'body' in their class
+ # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
# to mark actual text
- AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+ AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+ # At worst, we can try stuff which is comprised between two <br>
+ AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
# Try to grab and IRCify the first HTML par (<p> tag) in the given string.
# If possible, grab the one after the first heading
# text
# * :min_spaces => Minimum number of spaces a paragraph should have
#
- def Utils.ircify_first_html_par(xml, opts={})
- txt = String.new
+ def Utils.ircify_first_html_par(xml_org, opts={})
+ xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
min_spaces = opts[:min_spaces] || 8
min_spaces = 0 if min_spaces < 0
+ txt = String.new
+
while true
debug "Minimum number of spaces: #{min_spaces}"
header_found = xml.match(HX_REGEX)
# Nothing yet ... let's get drastic: we look for non-par elements too,
# but only for those that match something that we know is likely to
# contain text
+
+ # Attempt #1
header_found = xml
while txt.empty? or txt.count(" ") < min_spaces
candidate = header_found[AFTER_PAR1_REGEX]
txt = candidate.ircify_html
header_found = $'
txt.sub!(strip, '') if strip
- debug "(other attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
+ end
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Attempt #2
+ header_found = xml
+ while txt.empty? or txt.count(" ") < min_spaces
+ candidate = header_found[AFTER_PAR2_REGEX]
+ break unless candidate
+ txt = candidate.ircify_html
+ header_found = $'
+ txt.sub!(strip, '') if strip
+ debug "(other attempt \#2) #{txt.inspect} has #{txt.count(" ")} spaces"
end
debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
end
# Get the first pars of the first _count_ _urls_.
- # The pages are downloaded using an HttpUtil service passed as _opts_ :http_util,
- # and echoed as replies to the IRC message passed as _opts_ :message.
+ # The pages are downloaded using the bot httputil service.
+ # Returns an array of the first paragraphs fetched.
+ # If (optional) _opts_ :message is specified, those paragraphs are
+ # echoed as replies to the IRC message passed as _opts_ :message
#
def Utils.get_first_pars(urls, count, opts={})
idx = 0
msg = opts[:message]
+ retval = Array.new
while count > 0 and urls.length > 0
url = urls.shift
idx += 1
# FIXME what happens if some big file is returned? We should share
# code with the url plugin to only retrieve partial file content!
- xml = opts[:http_util].get_cached(url)
+ xml = self.bot.httputil.get(url)
if xml.nil?
debug "Unable to retrieve #{url}"
next
# FIXME only do this if the 'url' plugin is loaded
# TODO even better, put the code here
# par = @bot.plugins['url'].get_title_from_html(xml)
- next if par.empty?
+ if par.empty?
+ retval.push(nil)
+ next
+ end
end
msg.reply "[#{idx}] #{par}", :overlong => :truncate if msg
count -=1
+ retval.push(par)
end
+ return retval
end
-
end
end
+
+Irc::Utils.bot = Irc::Plugins.manager.bot