]> git.netwichtig.de Git - user/henk/code/ruby/rbot.git/blobdiff - data/rbot/plugins/url.rb
lart plugin: replace "me" with sourcenick
[user/henk/code/ruby/rbot.git] / data / rbot / plugins / url.rb
index e3cecb6b0d5805654c4ff0fe831928dc5bba0566..e1c9b47396e3bd2a782f47197c4fa64d38af54a9 100644 (file)
@@ -5,11 +5,7 @@
 
 define_structure :Url, :channel, :nick, :time, :url, :info
 
-class ::UrlLinkError < RuntimeError
-end
-
 class UrlPlugin < Plugin
-  TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
   LINK_INFO = "[Link Info]"
   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
 
@@ -49,98 +45,64 @@ class UrlPlugin < Plugin
   end
 
   def help(plugin, topic="")
-    "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
+    "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
   end
 
   def get_title_from_html(pagedata)
-    return unless TITLE_RE.match(pagedata)
-    $1.ircify_html
+    return pagedata.ircify_html_title
   end
 
-  def get_title_for_url(uri_str, nick = nil, channel = nil, ircline = nil)
+  def get_title_for_url(uri_str, opts = {})
 
     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
     return if url.scheme !~ /https?/
 
-    if url.host =~ @no_info_hosts
-      return "Sorry, info retrieval for #{url.host} is disabled"
+    # also check the ip, the canonical name and the aliases
+    begin
+      checks = TCPSocket.gethostbyname(url.host)
+      checks.delete_at(-2)
+    rescue => e
+      return "Unable to retrieve info for #{url.host}: #{e.message}"
     end
 
-    logopts = Hash.new
-    logopts[:nick] = nick if nick
-    logopts[:channel] = channel if channel
-    logopts[:ircline] = ircline if ircline
+    checks << url.host
+    checks.flatten!
+
+    unless checks.grep(@no_info_hosts).empty?
+      return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
+    end
+
+    logopts = opts.dup
 
     title = nil
-    extra = String.new
+    extra = []
 
     begin
-      debug "+ getting #{url.request_uri}"
-      @bot.httputil.get_response(url) { |resp|
-        case resp
-        when Net::HTTPSuccess
-
-          debug resp.to_hash
-
-          if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
-            # The page is text or HTML, so we can try finding a title and, if
-            # requested, the first par.
-            #
-            # We act differently depending on whether we want the first par or
-            # not: in the first case we download the initial part and the parse
-            # it; in the second case we only download as much as we need to find
-            # the title
-            #
-            if @bot.config['url.first_par']
-              partial = resp.partial_body(@bot.config['http.info_bytes'])
-              logopts[:title] = title = get_title_from_html(partial)
-              if url.fragment and not url.fragment.empty?
-                fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
-                partial.sub!(fragreg,'')
-              end
-              first_par = Utils.ircify_first_html_par(partial, :strip => title)
-              unless first_par.empty?
-                logopts[:extra] = first_par
-                extra << ", #{Bold}text#{Bold}: #{first_par}"
-              end
-              call_event(:url_added, url.to_s, logopts)
-              return "#{Bold}title#{Bold}: #{title}#{extra}" if title
-            else
-              resp.partial_body(@bot.config['http.info_bytes']) { |part|
-                logopts[:title] = title = get_title_from_html(part)
-                call_event(:url_added, url.to_s, logopts)
-                return "#{Bold}title#{Bold}: #{title}" if title
-              }
-            end
-          # if nothing was found, provide more basic info, as for non-html pages
-          else
-            resp.no_cache = true
-          end
+      debug "+ getting info for #{url.request_uri}"
+      info = Utils.get_html_info(url)
+      debug info
+      resp = info[:headers]
 
-          enc = resp['content-encoding']
-          logopts[:extra] = String.new
-          logopts[:extra] << "Content Type: #{resp['content-type']}"
-          if enc
-            logopts[:extra] << ", encoding: #{enc}"
-            extra << ", #{Bold}encoding#{Bold}: #{enc}"
-          end
+      logopts[:title] = title = info[:title]
 
-          unless @bot.config['url.titles_only']
-            # content doesn't have title, just display info.
-            size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
-            if size
-              logopts[:extra] << ", size: #{size} bytes"
-              size = ", #{Bold}size#{Bold}: #{size} bytes"
-            end
-            call_event(:url_added, url.to_s, logopts)
-            return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
-          end
-          call_event(:url_added, url.to_s, logopts)
-        else
-          raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+      if info[:content]
+        logopts[:extra] = info[:content]
+        extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
+      else
+        logopts[:extra] = String.new
+        logopts[:extra] << "Content Type: #{resp['content-type']}"
+        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
+        if enc = resp['content-encoding']
+          logopts[:extra] << ", encoding: #{enc}"
+          extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
         end
-      }
-      return nil
+
+        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
+        if size
+          logopts[:extra] << ", size: #{size} bytes"
+          extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
+        end
+      end
     rescue Exception => e
       case e
       when UrlLinkError
@@ -150,37 +112,46 @@ class UrlPlugin < Plugin
         raise "connecting to site/processing information (#{e.message})"
       end
     end
+
+    call_event(:url_added, url.to_s, logopts)
+    if title
+      extra.unshift("#{Bold}title#{Bold}: #{title}")
+    end
+    return extra.join(", ") if title or not @bot.config['url.titles_only']
   end
 
   def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
     return if urls.empty?
     debug "found urls #{urls.inspect}"
-    if m.public?
-      list = @registry[m.target] 
-    else
-      list = nil
-    end
+    list = m.public? ? @registry[m.target] : nil
+    debug "display link info: #{display_info}"
     urls_displayed = 0
-    urls.each { |urlstr|
+    urls.each do |urlstr|
       debug "working on #{urlstr}"
       next unless urlstr =~ /^https?:/
       title = nil
-      debug "display link info: #{display_info}"
+      debug "Getting title for #{urlstr}..."
+      reply = nil
+      begin
+        title = get_title_for_url(urlstr,
+                                  :nick => m.source.nick,
+                                  :channel => m.channel,
+                                  :ircline => m.message)
+        debug "Title #{title ? '' : 'not '} found"
+        reply = "#{LINK_INFO} #{title}" if title
+      rescue => e
+        if e.message =~ /\(404 - Not Found\)/i
+          # see if we failed to find the thing because of trailing punctuation
+          # but check that we still have 'something' in the URL
+          retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
+        end
+        reply = "Error #{e.message}"
+      end
+
       if display_info > urls_displayed
-        urls_displayed += 1
-        Thread.start do
-          debug "Getting title for #{urlstr}..."
-          begin
-            title = get_title_for_url urlstr, m.source.nick, m.channel, m.message
-            if title
-              m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
-              debug "Title found!"
-            else
-              debug "Title not found!"
-            end
-          rescue => e
-            m.reply "Error #{e.message}"
-          end
+        if reply
+          m.plainreply(reply, :overlong => :truncate)
+          urls_displayed += 1
         end
       end
 
@@ -191,20 +162,18 @@ class UrlPlugin < Plugin
 
       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
       debug "#{list.length} urls so far"
-      if list.length > @bot.config['url.max_urls']
-        list.pop
-      end
+      list.pop if list.length > @bot.config['url.max_urls']
       debug "storing url #{url.url}"
       list.unshift url
       debug "#{list.length} urls now"
-    }
+    end
     @registry[m.target] = list
   end
 
   def info(m, params)
     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
     urls = URI.extract(escaped)
-    handle_urls(m, urls, params[:urls].length)
+    Thread.new { handle_urls(m, urls, params[:urls].length) }
   end
 
   def listen(m)
@@ -212,8 +181,9 @@ class UrlPlugin < Plugin
     return if m.address?
 
     escaped = URI.escape(m.message, OUR_UNSAFE)
-    urls = URI.extract(escaped)
-    handle_urls(m, urls)
+    urls = URI.extract(escaped, ['http', 'https'])
+    return if urls.empty?
+    Thread.new { handle_urls(m, urls) }
   end
 
   def reply_urls(opts={})
@@ -225,7 +195,9 @@ class UrlPlugin < Plugin
     list[0..(max-1)].each do |url|
       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
       if @bot.config['url.info_on_list']
-        title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
+        title = url.info ||
+          get_title_for_url(url.url,
+                            :nick => url.nick, :channel => channel) rescue nil
         # If the url info was missing and we now have some, try to upgrade it
         if channel and title and not url.info
           ll = @registry[channel]
@@ -275,6 +247,7 @@ end
 
 plugin = UrlPlugin.new
 plugin.map 'urls info *urls', :action => 'info'
+plugin.map 'url info *urls', :action => 'info'
 plugin.map 'urls search :channel :limit :string', :action => 'search',
                           :defaults => {:limit => 4},
                           :requirements => {:limit => /^\d+$/},