Changed the way the URL grabber gets urls. Instead of using HEAD, it uses GET, but only grabs the first 50k of the page to check it for a header.

author: Chris Gahan <chris@ill-logic.com> 2006-02-09 18:20:55 +0000
committer: Chris Gahan <chris@ill-logic.com> 2006-02-09 18:20:55 +0000
commit: 5720064dde5ff1205bf072ffe01e7ab070b2152a (patch)
tree: 8a003f47697e92bc8d5da4c54aaea483e7745344
parent: 345df89e4d04c89c7cd43e21e918bf0b83bb1205 (diff)
1 files changed, 60 insertions, 31 deletions
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index 396c5ef2..858b5a05 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -312,6 +312,31 @@ class UrlPlugin < Plugin
     title = title[0..255] if title.length > 255
     "[Link Info] title: #{title}"
   end
+
+  def read_data_from_response(response, amount)
+    # Return at most `amount` units (as counted by String#length) of the response body as one string.
+    amount_read = 0
+    chunks = []
+    # read_body yields the body piece by piece; amount_read is the running total received so far.
+    response.read_body do |chunk|   # read body now
+      
+      amount_read += chunk.length
+      # If this chunk pushes the total past `amount`, cut the excess off its tail before keeping it.
+      if amount_read > amount
+        amount_of_overflow = amount_read - amount
+        chunk = chunk[0...-amount_of_overflow]
+      end
+      
+      chunks << chunk
+
+      break if amount_read >= amount   # leave the read_body block once the cap has been reached
+      
+    end
+    
+    chunks.join('')
+    
+  end
+
 
   def get_title_for_url(uri_str, depth=10)
     # This god-awful mess is what the ruby http library has reduced me to.
@@ -326,37 +351,41 @@ class UrlPlugin < Plugin
     return if url.scheme !~ /https?/
     
     puts "+ connecting to #{url.host}:#{url.port}"
-    http = @bot.httputil.get_proxy(url) 
-    title = http.start do |http|
-      url.path = '/' if url.path == ''
-      head = http.request_head(url.path)
-      case head
-        when Net::HTTPRedirection then
-          # call self recursively if this is a redirect
-          redirect_to = head['location']
-          puts "+ redirect location: #{redirect_to}"
-          url = URI.join url.to_s, redirect_to
-          puts "+ whee, redirecting to #{url.to_s}!"
-          title = get_title_for_url(url.to_s, depth-1)
-        when Net::HTTPSuccess then
-          if head['content-type'] =~ /^text\// and (not head['content-length'] or head['content-length'].to_i < 400000)
-            # since the content is 'text/*' and is small enough to
-            # be a webpage, retrieve the title from the page
-            puts "+ getting #{url.request_uri}"
-            response = http.request_get(url.request_uri)
-            return get_title_from_html(response.body)
-          else
-            # content doesn't have title, just display info.
-            size = head['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
-            #lastmod = head['last-modified']
-            return "[Link Info] type: #{head['content-type']}#{size ? ", size: #{size} bytes" : ""}"
-          end
-        when Net::HTTPClientError then
-          return "[Link Info] Error getting link (#{head.code} - #{head.message})"
-        when Net::HTTPServerError then
-          return "[Link Info] Error getting link (#{head.code} - #{head.message})"
-      end
-    end
+    http = @bot.httputil.get_proxy(url)
+    title = http.start { |http|
+      url.path = '/' if url.path == ''
+
+      http.request_get(url.path) { |response|
+        
+        case response
+          when Net::HTTPRedirection then
+            # call self recursively if this is a redirect
+            redirect_to = response['location']
+            puts "+ redirect location: #{redirect_to}"
+            url = URI.join url.to_s, redirect_to
+            puts "+ whee, redirecting to #{url.to_s}!"
+            title = get_title_for_url(url.to_s, depth-1)
+          when Net::HTTPSuccess then
+            if response['content-type'] =~ /^text\//
+              # since the content is 'text/*' and is small enough to
+              # be a webpage, retrieve the title from the page
+              puts "+ getting #{url.request_uri}"
+              data = read_data_from_response(response, 50000)
+              return get_title_from_html(data)
+            else
+              # content doesn't have title, just display info.
+              size = response['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
+              return "[Link Info] type: #{response['content-type']}#{size ? ", size: #{size} bytes" : ""}"
+            end
+          when Net::HTTPClientError then
+            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+          when Net::HTTPServerError then
+            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+        end # end of "case response"
+          
+      } # end of request block
+    } # end of http start block
+    
   rescue SocketError => e
     return "[Link Info] Error connecting to site (#{e.message})"
   end
author	Chris Gahan <chris@ill-logic.com>	2006-02-09 18:20:55 +0000
committer	Chris Gahan <chris@ill-logic.com>	2006-02-09 18:20:55 +0000
commit	5720064dde5ff1205bf072ffe01e7ab070b2152a (patch)
tree	8a003f47697e92bc8d5da4c54aaea483e7745344
parent	345df89e4d04c89c7cd43e21e918bf0b83bb1205 (diff)