summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiuseppe Bilotta <giuseppe.bilotta@gmail.com>2007-09-18 06:15:49 +0000
committerGiuseppe Bilotta <giuseppe.bilotta@gmail.com>2007-09-18 06:15:49 +0000
commit0079ca3ac1adb77e4ddc9ebd34149c60d73b7529 (patch)
tree13ef3434003aaa88dec2a015fd403dabec7e952c
parent83cb5754e808afc77ac625ef66e5ff128ee00d4f (diff)
HTML processing refactoring: Utils.get_html_info and related methods factored out of the url plugin
-rw-r--r--data/rbot/plugins/url.rb94
-rw-r--r--lib/rbot/core/utils/utils.rb85
2 files changed, 113 insertions, 66 deletions
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index 0809288f..7a752ec6 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -5,9 +5,6 @@
define_structure :Url, :channel, :nick, :time, :url, :info
-class ::UrlLinkError < RuntimeError
-end
-
class UrlPlugin < Plugin
LINK_INFO = "[Link Info]"
OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
@@ -67,75 +64,34 @@ class UrlPlugin < Plugin
logopts = opts.dup
title = nil
- extra = String.new
+ extra = []
begin
- debug "+ getting #{url.request_uri}"
- @bot.httputil.get_response(url) { |resp|
- case resp
- when Net::HTTPSuccess
-
- debug resp.to_hash
+ debug "+ getting info for #{url.request_uri}"
+ info = Utils.get_html_info(url)
+ debug info
+ resp = info[:headers]
- if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
- # The page is text or HTML, so we can try finding a title and, if
- # requested, the first par.
- #
- # We act differently depending on whether we want the first par or
- # not: in the first case we download the initial part and the parse
- # it; in the second case we only download as much as we need to find
- # the title
- #
- if @bot.config['url.first_par']
- partial = resp.partial_body(@bot.config['http.info_bytes'])
- logopts[:title] = title = get_title_from_html(partial)
- if url.fragment and not url.fragment.empty?
- fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
- partial.sub!(fragreg,'')
- end
- first_par = Utils.ircify_first_html_par(partial, :strip => title)
- unless first_par.empty?
- logopts[:extra] = first_par
- extra << ", #{Bold}text#{Bold}: #{first_par}"
- end
- call_event(:url_added, url.to_s, logopts)
- return "#{Bold}title#{Bold}: #{title}#{extra}" if title
- else
- resp.partial_body(@bot.config['http.info_bytes']) { |part|
- logopts[:title] = title = get_title_from_html(part)
- call_event(:url_added, url.to_s, logopts)
- return "#{Bold}title#{Bold}: #{title}" if title
- }
- end
- # if nothing was found, provide more basic info, as for non-html pages
- else
- resp.no_cache = true
- end
+ logopts[:title] = title = info[:title]
- enc = resp['content-encoding']
- logopts[:extra] = String.new
- logopts[:extra] << "Content Type: #{resp['content-type']}"
- if enc
- logopts[:extra] << ", encoding: #{enc}"
- extra << ", #{Bold}encoding#{Bold}: #{enc}"
- end
+ if info[:content]
+ logopts[:extra] = info[:content]
+ extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
+ else
+ logopts[:extra] = String.new
+ logopts[:extra] << "Content Type: #{resp['content-type']}"
+ extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
+ if enc = resp['content-encoding']
+ logopts[:extra] << ", encoding: #{enc}"
+ extra << "#{Bold}encoding#{Bold}: #{enc}"
+ end
- unless @bot.config['url.titles_only']
- # content doesn't have title, just display info.
- size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
- if size
- logopts[:extra] << ", size: #{size} bytes"
- size = ", #{Bold}size#{Bold}: #{size} bytes"
- end
- call_event(:url_added, url.to_s, logopts)
- return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
- end
- call_event(:url_added, url.to_s, logopts)
- else
- raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+ size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
+ if size
+ logopts[:extra] << ", size: #{size} bytes"
+ extra << "#{Bold}size#{Bold}: #{size} bytes"
end
- }
- return nil
+ end
rescue Exception => e
case e
when UrlLinkError
@@ -145,6 +101,12 @@ class UrlPlugin < Plugin
raise "connecting to site/processing information (#{e.message})"
end
end
+
+ call_event(:url_added, url.to_s, logopts)
+ if title
+ extra.unshift("#{Bold}title#{Bold}: #{title}")
+ end
+ return extra.join(", ") if title or not @bot.config['url.titles_only']
end
def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 0b10b52f..8c23b2cf 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -656,6 +656,91 @@ module ::Irc
end
end
+ # This method extracts title, content (first par) and extra
+ # information from the given document _doc_.
+ #
+ # _doc_ can be a URI, a Net::HTTPResponse or a String.
+ #
+ # If _doc_ is a String, only title and content information
+ # are retrieved (if possible), using standard methods.
+ #
+ # If _doc_ is a URI or a Net::HTTPResponse, additional
+ # information is retrieved, and special title/summary
+ # extraction routines are used if possible.
+ #
+ # When _doc_ is a URI with a non-empty fragment, the fragment is
+ # passed along as the :uri_fragment option unless the caller
+ # already provided one. The caller's _opts_ Hash is not modified.
+ #
+ # Returns the Hash produced by Utils.get_string_html_info or
+ # Utils.get_resp_html_info. For a URI, an empty Hash is returned
+ # if the HTTP layer never yields a response to the block.
+ #
+ # Raises a RuntimeError if _doc_ is none of the supported types.
+ #
+ def Utils.get_html_info(doc, opts={})
+   case doc
+   when String
+     Utils.get_string_html_info(doc, opts)
+   when Net::HTTPResponse
+     Utils.get_resp_html_info(doc, opts)
+   when URI
+     if doc.fragment and not doc.fragment.empty?
+       # Work on a copy so the fragment does not leak into the
+       # options Hash owned by the caller.
+       opts = opts.dup
+       opts[:uri_fragment] ||= doc.fragment
+     end
+     ret = Hash.new
+     @@bot.httputil.get_response(doc) { |resp|
+       ret = Utils.get_resp_html_info(resp, opts)
+     }
+     return ret
+   else
+     # A bare `raise` here would only report "unhandled exception";
+     # say what was actually wrong instead (still a RuntimeError).
+     raise "unsupported document type #{doc.class} (expected String, Net::HTTPResponse or URI)"
+   end
+ end
+
+ # Raised by Utils.get_resp_html_info when the HTTP response for a
+ # link is not a Net::HTTPSuccess.
+ class ::UrlLinkError < RuntimeError; end
+
+ # Collects title, content (first par) and header information
+ # from the Net::HTTPResponse _resp_.
+ #
+ # The only option currently recognized in _opts_ is
+ # uri_fragment:: the URI fragment of the original request
+ #
+ # The returned Hash has the following keys:
+ # title:: the title of the document (if any)
+ # content:: the first paragraph of the document (if any)
+ # headers::
+ #   the headers of the Net::HTTPResponse, as returned by
+ #   Net::HTTPResponse#to_hash: a Hash whose keys are the
+ #   lowercase HTTP header field names and whose values
+ #   are Arrays.
+ #
+ # title and content are only filled in when the content type
+ # looks like text, HTML or XML.
+ #
+ # Raises UrlLinkError if _resp_ is not a Net::HTTPSuccess.
+ #
+ def Utils.get_resp_html_info(resp, opts={})
+   unless Net::HTTPSuccess === resp
+     raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+   end
+
+   info = { :headers => resp.to_hash }
+
+   # Only textual documents are worth scanning for a title and
+   # a first paragraph.
+   if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
+     head = resp.partial_body(@@bot.config['http.info_bytes'])
+     info.merge!(Utils.get_string_html_info(head, opts))
+   end
+
+   info
+ end
+
+ # This method extracts title and content (first par)
+ # from the given HTML or XML document _text_, using
+ # standard methods (String#ircify_html_title,
+ # Utils.ircify_first_html_par)
+ #
+ # Currently, the only accepted option (in _opts_) is
+ # uri_fragment:: the URI fragment of the original request
+ #
+ # When a non-empty uri_fragment is given, everything up to and
+ # including the first <a ... name=fragment ...> tag is dropped
+ # before looking for the first paragraph, so that the content
+ # starts at the anchor the URI points to.
+ #
+ # Returns a Hash with the keys
+ # title:: the result of String#ircify_html_title on _text_
+ # content:: the first paragraph, or nil if it is empty
+ #
+ # _text_ itself is left untouched: the work is done on a copy.
+ #
+ def Utils.get_string_html_info(text, opts={})
+   txt = text.dup
+   title = txt.ircify_html_title
+   if frag = opts[:uri_fragment] and not frag.empty?
+     # The fragment comes straight from a user-supplied URL, so it
+     # must be matched literally: unescaped, something like "foo("
+     # would raise a RegexpError and "a.b" would match "aXb".
+     fragreg = /.*?<a\s+[^>]*name=["']?#{Regexp.escape(frag)}["']?.*?>/im
+     txt.sub!(fragreg,'')
+   end
+   content = Utils.ircify_first_html_par(txt, :strip => title)
+   content = nil if content.empty?
+   return {:title => title, :content => content}
+ end
+
# Get the first pars of the first _count_ _urls_.
# The pages are downloaded using the bot httputil service.
# Returns an array of the first paragraphs fetched.