A fixed Chuck Norris plugin. (The XML feed was disabled, so this one just scrapes...

[user/henk/code/ruby/rbot.git] / data / rbot / plugins / url.rb
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb

index 396c5ef219ccc7db8deb0b61782500ed1a30a4f0..1e72a3a14f4a6be39e980576caa632253c9ed3ed 100644 (file)
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -312,6 +312,31 @@ class UrlPlugin < Plugin
      title = title[0..255] if title.length > 255
      "[Link Info] title: #{title}"
    end
+\r
+  def read_data_from_response(response, amount)\r
+    # Collects the chunks yielded by response.read_body until 'amount' (counted via chunk.length) is reached; returns them joined.\r
+    amount_read = 0\r
+    chunks = []\r
+    # The block below runs once for each chunk that read_body yields.\r
+    response.read_body do |chunk|   # read body now\r
+      # Add this chunk to the running total before checking it against the limit.\r
+      amount_read += chunk.length\r
+      # If the total now exceeds 'amount', cut the excess off the end of this chunk.\r
+      if amount_read > amount\r
+        amount_of_overflow = amount_read - amount\r
+        chunk = chunk[0...-amount_of_overflow]\r
+      end\r
+      \r
+      chunks << chunk\r
+      # Leave the read_body block as soon as the limit has been reached.\r
+      break if amount_read >= amount\r
+      \r
+    end\r
+    # Return the kept chunks concatenated into one string.\r
+    chunks.join('')\r
+    \r
+  end\r
+\r
  
    def get_title_for_url(uri_str, depth=10)
      # This god-awful mess is what the ruby http library has reduced me to.
@@ -326,37 +351,41 @@ class UrlPlugin < Plugin
      return if url.scheme !~ /https?/
      
      puts "+ connecting to #{url.host}:#{url.port}"
-    http = @bot.httputil.get_proxy(url) 
-    title = http.start do |http|
-      url.path = '/' if url.path == ''
-      head = http.request_head(url.path)
-      case head
-        when Net::HTTPRedirection then
-          # call self recursively if this is a redirect
-          redirect_to = head['location']
-          puts "+ redirect location: #{redirect_to}"
-          url = URI.join url.to_s, redirect_to
-          puts "+ whee, redirecting to #{url.to_s}!"
-          title = get_title_for_url(url.to_s, depth-1)
-        when Net::HTTPSuccess then
-          if head['content-type'] =~ /^text\// and (not head['content-length'] or head['content-length'].to_i < 400000)
-            # since the content is 'text/*' and is small enough to
-            # be a webpage, retrieve the title from the page
-            puts "+ getting #{url.request_uri}"
-            response = http.request_get(url.request_uri)
-            return get_title_from_html(response.body)
-          else
-            # content doesn't have title, just display info.
-            size = head['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
-            #lastmod = head['last-modified']
-            return "[Link Info] type: #{head['content-type']}#{size ? ", size: #{size} bytes" : ""}"
-          end
-        when Net::HTTPClientError then
-          return "[Link Info] Error getting link (#{head.code} - #{head.message})"
-        when Net::HTTPServerError then
-          return "[Link Info] Error getting link (#{head.code} - #{head.message})"
-      end
-    end
+    http = @bot.httputil.get_proxy(url)
+    title = http.start { |http|
+      url.path = '/' if url.path == ''\r
+\r
+      http.request_get(url.path, "User-Agent" => "rbot-url_plugin/666.666") { |response|\r
+        
+        case response
+          when Net::HTTPRedirection then
+            # call self recursively if this is a redirect
+            redirect_to = response['location']  || './'
+            puts "+ redirect location: #{redirect_to.inspect}"
+            url = URI.join url.to_s, redirect_to
+            puts "+ whee, redirecting to #{url.to_s}!"
+            title = get_title_for_url(url.to_s, depth-1)
+          when Net::HTTPSuccess then
+            if response['content-type'] =~ /^text\//
+              # since the content is 'text/*' and is small enough to
+              # be a webpage, retrieve the title from the page
+              puts "+ getting #{url.request_uri}"\r
+              data = read_data_from_response(response, 50000)\r
+              return get_title_from_html(data)
+            else
+              # content doesn't have title, just display info.
+              size = response['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
+              return "[Link Info] type: #{response['content-type']}#{size ? ", size: #{size} bytes" : ""}"
+            end
+          when Net::HTTPClientError then
+            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+          when Net::HTTPServerError then
+            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+        end # end of "case response"\r
+          \r
+      } # end of request block
+    } # end of http start block\r
+    
    rescue SocketError => e
      return "[Link Info] Error connecting to site (#{e.message})"
    end