diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-09-18 06:15:49 +0000 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-09-18 06:15:49 +0000 |
commit | 0079ca3ac1adb77e4ddc9ebd34149c60d73b7529 (patch) | |
tree | 13ef3434003aaa88dec2a015fd403dabec7e952c | |
parent | 83cb5754e808afc77ac625ef66e5ff128ee00d4f (diff) |
HTML processing refactoring: Utils.get_html_info and related methods factored out of the url plugin
-rw-r--r-- | data/rbot/plugins/url.rb | 94 | ||||
-rw-r--r-- | lib/rbot/core/utils/utils.rb | 85 |
2 files changed, 113 insertions, 66 deletions
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb index 0809288f..7a752ec6 100644 --- a/data/rbot/plugins/url.rb +++ b/data/rbot/plugins/url.rb @@ -5,9 +5,6 @@ define_structure :Url, :channel, :nick, :time, :url, :info -class ::UrlLinkError < RuntimeError -end - class UrlPlugin < Plugin LINK_INFO = "[Link Info]" OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N') @@ -67,75 +64,34 @@ class UrlPlugin < Plugin logopts = opts.dup title = nil - extra = String.new + extra = [] begin - debug "+ getting #{url.request_uri}" - @bot.httputil.get_response(url) { |resp| - case resp - when Net::HTTPSuccess - - debug resp.to_hash + debug "+ getting info for #{url.request_uri}" + info = Utils.get_html_info(url) + debug info + resp = info[:headers] - if resp['content-type'] =~ /^text\/|(?:x|ht)ml/ - # The page is text or HTML, so we can try finding a title and, if - # requested, the first par. - # - # We act differently depending on whether we want the first par or - # not: in the first case we download the initial part and the parse - # it; in the second case we only download as much as we need to find - # the title - # - if @bot.config['url.first_par'] - partial = resp.partial_body(@bot.config['http.info_bytes']) - logopts[:title] = title = get_title_from_html(partial) - if url.fragment and not url.fragment.empty? - fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im - partial.sub!(fragreg,'') - end - first_par = Utils.ircify_first_html_par(partial, :strip => title) - unless first_par.empty? - logopts[:extra] = first_par - extra << ", #{Bold}text#{Bold}: #{first_par}" - end - call_event(:url_added, url.to_s, logopts) - return "#{Bold}title#{Bold}: #{title}#{extra}" if title - else - resp.partial_body(@bot.config['http.info_bytes']) { |part| - logopts[:title] = title = get_title_from_html(part) - call_event(:url_added, url.to_s, logopts) - return "#{Bold}title#{Bold}: #{title}" if title - } - end - # if nothing was found, provide more basic info, as for non-html pages - else - resp.no_cache = true - end + logopts[:title] = title = info[:title] - enc = resp['content-encoding'] - logopts[:extra] = String.new - logopts[:extra] << "Content Type: #{resp['content-type']}" - if enc - logopts[:extra] << ", encoding: #{enc}" - extra << ", #{Bold}encoding#{Bold}: #{enc}" - end + if info[:content] + logopts[:extra] = info[:content] + extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par'] + else + logopts[:extra] = String.new + logopts[:extra] << "Content Type: #{resp['content-type']}" + extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title + if enc = resp['content-encoding'] + logopts[:extra] << ", encoding: #{enc}" + extra << "#{Bold}encoding#{Bold}: #{enc}" + end - unless @bot.config['url.titles_only'] - # content doesn't have title, just display info. - size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil - if size - logopts[:extra] << ", size: #{size} bytes" - size = ", #{Bold}size#{Bold}: #{size} bytes" - end - call_event(:url_added, url.to_s, logopts) - return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}" - end - call_event(:url_added, url.to_s, logopts) - else - raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})" + size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil + if size + logopts[:extra] << ", size: #{size} bytes" + extra << "#{Bold}size#{Bold}: #{size} bytes" end - } - return nil + end rescue Exception => e case e when UrlLinkError @@ -145,6 +101,12 @@ class UrlPlugin < Plugin raise "connecting to site/processing information (#{e.message})" end end + + call_event(:url_added, url.to_s, logopts) + if title + extra.unshift("#{Bold}title#{Bold}: #{title}") + end + return extra.join(", ") if title or not @bot.config['url.titles_only'] end def handle_urls(m, urls, display_info=@bot.config['url.display_link_info']) diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 0b10b52f..8c23b2cf 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -656,6 +656,91 @@ module ::Irc end end + # This method extracts title, content (first par) and extra + # information from the given document _doc_. + # + # _doc_ can be an URI, a Net::HTTPResponse or a String. + # + # If _doc_ is a String, only title and content information + # are retrieved (if possible), using standard methods. + # + # If _doc_ is an URI or a Net::HTTPResponse, additional + # information is retrieved, and special title/summary + # extraction routines are used if possible. + # + def Utils.get_html_info(doc, opts={}) + case doc + when String + Utils.get_string_html_info(doc, opts) + when Net::HTTPResponse + Utils.get_resp_html_info(doc, opts) + when URI + if doc.fragment and not doc.fragment.empty? + opts[:uri_fragment] ||= doc.fragment + end + ret = Hash.new + @@bot.httputil.get_response(doc) { |resp| + ret = Utils.get_resp_html_info(resp, opts) + } + return ret + else + raise + end + end + + class ::UrlLinkError < RuntimeError + end + + # This method extracts title, content (first par) and extra + # information from the given Net::HTTPResponse _resp_. + # + # Currently, the only accepted option (in _opts_) is + # uri_fragment:: the URI fragment of the original request + # + # Returns a Hash with the following keys: + # title:: the title of the document (if any) + # content:: the first paragraph of the document (if any) + # headers:: + # the headers of the Net::HTTPResponse. The value is + # a Hash whose keys are lowercase forms of the HTTP + # header fields, and whose values are Arrays. + # + def Utils.get_resp_html_info(resp, opts={}) + ret = Hash.new + case resp + when Net::HTTPSuccess + ret[:headers] = resp.to_hash + + if resp['content-type'] =~ /^text\/|(?:x|ht)ml/ + partial = resp.partial_body(@@bot.config['http.info_bytes']) + ret.merge!(Utils.get_string_html_info(partial, opts)) + end + return ret + else + raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})" + end + end + + # This method extracts title and content (first par) + # from the given HTML or XML document _text_, using + # standard methods (String#ircify_html_title, + # Utils.ircify_first_html_par) + # + # Currently, the only accepted option (in _opts_) is + # uri_fragment:: the URI fragment of the original request + # + def Utils.get_string_html_info(text, opts={}) + txt = text.dup + title = txt.ircify_html_title + if frag = opts[:uri_fragment] and not frag.empty? + fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im + txt.sub!(fragreg,'') + end + content = Utils.ircify_first_html_par(txt, :strip => title) + content = nil if content.empty? + return {:title => title, :content => content} + end + # Get the first pars of the first _count_ _urls_. # The pages are downloaded using the bot httputil service. # Returns an array of the first paragraphs fetched. |