HTML processing refactoring: Utils.get_html_info and related methods factored out of the url plugin

author: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-09-18 06:15:49 +0000
committer: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-09-18 06:15:49 +0000
commit: 0079ca3ac1adb77e4ddc9ebd34149c60d73b7529 (patch)
tree: 13ef3434003aaa88dec2a015fd403dabec7e952c
parent: 83cb5754e808afc77ac625ef66e5ff128ee00d4f (diff)
2 files changed, 113 insertions, 66 deletions
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index 0809288f..7a752ec6 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -5,9 +5,6 @@
 
 define_structure :Url, :channel, :nick, :time, :url, :info
 
-class ::UrlLinkError < RuntimeError
-end
-
 class UrlPlugin < Plugin
   LINK_INFO = "[Link Info]"
   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
@@ -67,75 +64,34 @@ class UrlPlugin < Plugin
     logopts = opts.dup
 
     title = nil
-    extra = String.new
+    extra = []
 
     begin
-      debug "+ getting #{url.request_uri}"
-      @bot.httputil.get_response(url) { |resp|
-        case resp
-        when Net::HTTPSuccess
-
-          debug resp.to_hash
+      debug "+ getting info for #{url.request_uri}"
+      info = Utils.get_html_info(url)
+      debug info
+      resp = info[:headers]
 
-          if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
-            # The page is text or HTML, so we can try finding a title and, if
-            # requested, the first par.
-            #
-            # We act differently depending on whether we want the first par or
-            # not: in the first case we download the initial part and the parse
-            # it; in the second case we only download as much as we need to find
-            # the title
-            #
-            if @bot.config['url.first_par']
-              partial = resp.partial_body(@bot.config['http.info_bytes'])
-              logopts[:title] = title = get_title_from_html(partial)
-              if url.fragment and not url.fragment.empty?
-                fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
-                partial.sub!(fragreg,'')
-              end
-              first_par = Utils.ircify_first_html_par(partial, :strip => title)
-              unless first_par.empty?
-                logopts[:extra] = first_par
-                extra << ", #{Bold}text#{Bold}: #{first_par}"
-              end
-              call_event(:url_added, url.to_s, logopts)
-              return "#{Bold}title#{Bold}: #{title}#{extra}" if title
-            else
-              resp.partial_body(@bot.config['http.info_bytes']) { |part|
-                logopts[:title] = title = get_title_from_html(part)
-                call_event(:url_added, url.to_s, logopts)
-                return "#{Bold}title#{Bold}: #{title}" if title
-              }
-            end
-          # if nothing was found, provide more basic info, as for non-html pages
-          else
-            resp.no_cache = true
-          end
+      logopts[:title] = title = info[:title]
 
-          enc = resp['content-encoding']
-          logopts[:extra] = String.new
-          logopts[:extra] << "Content Type: #{resp['content-type']}"
-          if enc
-            logopts[:extra] << ", encoding: #{enc}"
-            extra << ", #{Bold}encoding#{Bold}: #{enc}"
-          end
+      if info[:content]
+        logopts[:extra] = info[:content]
+        extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
+      else
+        logopts[:extra] = String.new
+        logopts[:extra] << "Content Type: #{resp['content-type']}"
+        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
+        if enc = resp['content-encoding']
+          logopts[:extra] << ", encoding: #{enc}"
+          extra << "#{Bold}encoding#{Bold}: #{enc}"
+        end
 
-          unless @bot.config['url.titles_only']
-            # content doesn't have title, just display info.
-            size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
-            if size
-              logopts[:extra] << ", size: #{size} bytes"
-              size = ", #{Bold}size#{Bold}: #{size} bytes"
-            end
-            call_event(:url_added, url.to_s, logopts)
-            return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
-          end
-          call_event(:url_added, url.to_s, logopts)
-        else
-          raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
+        if size
+          logopts[:extra] << ", size: #{size} bytes"
+          extra << "#{Bold}size#{Bold}: #{size} bytes"
         end
-      }
-      return nil
+      end
     rescue Exception => e
       case e
       when UrlLinkError
@@ -145,6 +101,12 @@ class UrlPlugin < Plugin
         raise "connecting to site/processing information (#{e.message})"
       end
     end
+
+    call_event(:url_added, url.to_s, logopts)
+    if title
+      extra.unshift("#{Bold}title#{Bold}: #{title}")
+    end
+    return extra.join(", ") if title or not @bot.config['url.titles_only']
   end
 
   def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 0b10b52f..8c23b2cf 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -656,6 +656,91 @@ module ::Irc
       end
     end
 
+    # This method extracts title, content (first par) and extra
+    # information from the given document _doc_.
+    #
+    # _doc_ can be a URI, a Net::HTTPResponse or a String.
+    #
+    # If _doc_ is a String, only title and content information
+    # are retrieved (if possible), using standard methods.
+    #
+    # If _doc_ is a URI or a Net::HTTPResponse, additional
+    # information is retrieved, and special title/summary
+    # extraction routines are used if possible.
+    #
+    def Utils.get_html_info(doc, opts={})
+      case doc
+      when String
+        Utils.get_string_html_info(doc, opts)
+      when Net::HTTPResponse
+        Utils.get_resp_html_info(doc, opts)
+      when URI
+        if doc.fragment and not doc.fragment.empty?
+          opts[:uri_fragment] ||= doc.fragment
+        end
+        ret = Hash.new
+        @@bot.httputil.get_response(doc) { |resp|
+          ret = Utils.get_resp_html_info(resp, opts)
+        }
+        return ret
+      else
+        raise
+      end
+    end
+
+    class ::UrlLinkError < RuntimeError
+    end
+
+    # This method extracts title, content (first par) and extra
+    # information from the given Net::HTTPResponse _resp_.
+    #
+    # Currently, the only accepted option (in _opts_) is
+    # uri_fragment:: the URI fragment of the original request
+    #
+    # Returns a Hash with the following keys:
+    # title:: the title of the document (if any)
+    # content:: the first paragraph of the document (if any)
+    # headers::
+    #   the headers of the Net::HTTPResponse. The value is
+    #   a Hash whose keys are lowercase forms of the HTTP
+    #   header fields, and whose values are Arrays.
+    #
+    def Utils.get_resp_html_info(resp, opts={})
+      ret = Hash.new
+      case resp
+      when Net::HTTPSuccess
+        ret[:headers] = resp.to_hash
+
+        if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
+          partial = resp.partial_body(@@bot.config['http.info_bytes'])
+          ret.merge!(Utils.get_string_html_info(partial, opts))
+        end
+        return ret
+      else
+        raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+      end
+    end
+
+    # This method extracts title and content (first par)
+    # from the given HTML or XML document _text_, using
+    # standard methods (String#ircify_html_title,
+    # Utils.ircify_first_html_par)
+    #
+    # Currently, the only accepted option (in _opts_) is
+    # uri_fragment:: the URI fragment of the original request
+    #
+    def Utils.get_string_html_info(text, opts={})
+      txt = text.dup
+      title = txt.ircify_html_title
+      if frag = opts[:uri_fragment] and not frag.empty?
+        fragreg = /.*?<a\s+[^>]*name=["']?#{frag}["']?.*?>/im
+        txt.sub!(fragreg,'')
+      end
+      content = Utils.ircify_first_html_par(txt, :strip => title)
+      content = nil if content.empty?
+      return {:title => title, :content => content}
+    end
+
     # Get the first pars of the first _count_ _urls_.
     # The pages are downloaded using the bot httputil service.
     # Returns an array of the first paragraphs fetched.
author	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-09-18 06:15:49 +0000
committer	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-09-18 06:15:49 +0000
commit	0079ca3ac1adb77e4ddc9ebd34149c60d73b7529 (patch)
tree	13ef3434003aaa88dec2a015fd403dabec7e952c
parent	83cb5754e808afc77ac625ef66e5ff128ee00d4f (diff)