added 'alias rm' as alternative for 'alias remove'

[user/henk/code/ruby/rbot.git] / data / rbot / plugins / url.rb
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb

index 65d75eabd098fffa4bbd51c922b5c665e0efa8d0..269227a3c468e21d643d83e8f66296be5e6128fb 100644 (file)
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -1,283 +1,29 @@
-require 'net/http'
-require 'uri'
-require 'cgi'
+define_structure :Url, :channel, :nick, :time, :url, :info
  
-Url = Struct.new("Url", :channel, :nick, :time, :url) unless defined?(Url)
-TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
-
-UNESCAPE_TABLE = {
-    'raquo' => '>>',
-    'quot' => '"',
-    'micro' => 'u',
-    'copy' => '(c)',
-    'trade' => '(tm)',
-    'reg' => '(R)',
-    '#174' => '(R)',
-    '#8220' => '"',
-    '#8221' => '"',
-    '#8212' => '--',
-    '#39' => '\'',
-=begin
-    # extras codes, for future use...
-    'zwnj' => '&#8204;',
-    'aring' => '\xe5',
-    'gt' => '>',
-    'yen' => '\xa5',
-    'ograve' => '\xf2',
-    'Chi' => '&#935;',
-    'bull' => '&#8226;',
-    'Egrave' => '\xc8',
-    'Ntilde' => '\xd1',
-    'upsih' => '&#978;',
-    'Yacute' => '\xdd',
-    'asymp' => '&#8776;',
-    'radic' => '&#8730;',
-    'otimes' => '&#8855;',
-    'nabla' => '&#8711;',
-    'aelig' => '\xe6',
-    'oelig' => '&#339;',
-    'equiv' => '&#8801;',
-    'Psi' => '&#936;',
-    'auml' => '\xe4',
-    'circ' => '&#710;',
-    'Acirc' => '\xc2',
-    'Epsilon' => '&#917;',
-    'Yuml' => '&#376;',
-    'Eta' => '&#919;',
-    'lt' => '<',
-    'Icirc' => '\xce',
-    'Upsilon' => '&#933;',
-    'ndash' => '&#8211;',
-    'there4' => '&#8756;',
-    'Prime' => '&#8243;',
-    'prime' => '&#8242;',
-    'psi' => '&#968;',
-    'Kappa' => '&#922;',
-    'rsaquo' => '&#8250;',
-    'Tau' => '&#932;',
-    'darr' => '&#8595;',
-    'ocirc' => '\xf4',
-    'lrm' => '&#8206;',
-    'zwj' => '&#8205;',
-    'cedil' => '\xb8',
-    'Ecirc' => '\xca',
-    'not' => '\xac',
-    'amp' => '&',
-    'AElig' => '\xc6',
-    'oslash' => '\xf8',
-    'acute' => '\xb4',
-    'lceil' => '&#8968;',
-    'laquo' => '\xab',
-    'shy' => '\xad',
-    'rdquo' => '&#8221;',
-    'ge' => '&#8805;',
-    'Igrave' => '\xcc',
-    'Ograve' => '\xd2',
-    'euro' => '&#8364;',
-    'dArr' => '&#8659;',
-    'sdot' => '&#8901;',
-    'nbsp' => '\xa0',
-    'lfloor' => '&#8970;',
-    'lArr' => '&#8656;',
-    'Auml' => '\xc4',
-    'larr' => '&#8592;',
-    'Atilde' => '\xc3',
-    'Otilde' => '\xd5',
-    'szlig' => '\xdf',
-    'clubs' => '&#9827;',
-    'diams' => '&#9830;',
-    'agrave' => '\xe0',
-    'Ocirc' => '\xd4',
-    'Iota' => '&#921;',
-    'Theta' => '&#920;',
-    'Pi' => '&#928;',
-    'OElig' => '&#338;',
-    'Scaron' => '&#352;',
-    'frac14' => '\xbc',
-    'egrave' => '\xe8',
-    'sub' => '&#8834;',
-    'iexcl' => '\xa1',
-    'frac12' => '\xbd',
-    'sbquo' => '&#8218;',
-    'ordf' => '\xaa',
-    'sum' => '&#8721;',
-    'prop' => '&#8733;',
-    'Uuml' => '\xdc',
-    'ntilde' => '\xf1',
-    'sup' => '&#8835;',
-    'theta' => '&#952;',
-    'prod' => '&#8719;',
-    'nsub' => '&#8836;',
-    'hArr' => '&#8660;',
-    'rlm' => '&#8207;',
-    'THORN' => '\xde',
-    'infin' => '&#8734;',
-    'yuml' => '\xff',
-    'Mu' => '&#924;',
-    'le' => '&#8804;',
-    'Eacute' => '\xc9',
-    'thinsp' => '&#8201;',
-    'ecirc' => '\xea',
-    'bdquo' => '&#8222;',
-    'Sigma' => '&#931;',
-    'fnof' => '&#402;',
-    'Aring' => '\xc5',
-    'tilde' => '&#732;',
-    'frac34' => '\xbe',
-    'emsp' => '&#8195;',
-    'mdash' => '&#8212;',
-    'uarr' => '&#8593;',
-    'permil' => '&#8240;',
-    'Ugrave' => '\xd9',
-    'rarr' => '&#8594;',
-    'Agrave' => '\xc0',
-    'chi' => '&#967;',
-    'forall' => '&#8704;',
-    'eth' => '\xf0',
-    'rceil' => '&#8969;',
-    'iuml' => '\xef',
-    'gamma' => '&#947;',
-    'lambda' => '&#955;',
-    'harr' => '&#8596;',
-    'rang' => '&#9002;',
-    'xi' => '&#958;',
-    'dagger' => '&#8224;',
-    'divide' => '\xf7',
-    'Ouml' => '\xd6',
-    'image' => '&#8465;',
-    'alefsym' => '&#8501;',
-    'igrave' => '\xec',
-    'otilde' => '\xf5',
-    'Oacute' => '\xd3',
-    'sube' => '&#8838;',
-    'alpha' => '&#945;',
-    'frasl' => '&#8260;',
-    'ETH' => '\xd0',
-    'lowast' => '&#8727;',
-    'Nu' => '&#925;',
-    'plusmn' => '\xb1',
-    'Euml' => '\xcb',
-    'real' => '&#8476;',
-    'sup1' => '\xb9',
-    'sup2' => '\xb2',
-    'sup3' => '\xb3',
-    'Oslash' => '\xd8',
-    'Aacute' => '\xc1',
-    'cent' => '\xa2',
-    'oline' => '&#8254;',
-    'Beta' => '&#914;',
-    'perp' => '&#8869;',
-    'Delta' => '&#916;',
-    'loz' => '&#9674;',
-    'pi' => '&#960;',
-    'iota' => '&#953;',
-    'empty' => '&#8709;',
-    'euml' => '\xeb',
-    'brvbar' => '\xa6',
-    'iacute' => '\xed',
-    'para' => '\xb6',
-    'micro' => '\xb5',
-    'cup' => '&#8746;',
-    'weierp' => '&#8472;',
-    'uuml' => '\xfc',
-    'part' => '&#8706;',
-    'icirc' => '\xee',
-    'delta' => '&#948;',
-    'omicron' => '&#959;',
-    'upsilon' => '&#965;',
-    'Iuml' => '\xcf',
-    'Lambda' => '&#923;',
-    'Xi' => '&#926;',
-    'kappa' => '&#954;',
-    'ccedil' => '\xe7',
-    'Ucirc' => '\xdb',
-    'cap' => '&#8745;',
-    'mu' => '&#956;',
-    'scaron' => '&#353;',
-    'lsquo' => '&#8216;',
-    'isin' => '&#8712;',
-    'Zeta' => '&#918;',
-    'supe' => '&#8839;',
-    'deg' => '\xb0',
-    'and' => '&#8743;',
-    'tau' => '&#964;',
-    'pound' => '\xa3',
-    'hellip' => '&#8230;',
-    'curren' => '\xa4',
-    'int' => '&#8747;',
-    'ucirc' => '\xfb',
-    'rfloor' => '&#8971;',
-    'ensp' => '&#8194;',
-    'crarr' => '&#8629;',
-    'ugrave' => '\xf9',
-    'notin' => '&#8713;',
-    'exist' => '&#8707;',
-    'uArr' => '&#8657;',
-    'cong' => '&#8773;',
-    'Dagger' => '&#8225;',
-    'oplus' => '&#8853;',
-    'times' => '\xd7',
-    'atilde' => '\xe3',
-    'piv' => '&#982;',
-    'ni' => '&#8715;',
-    'Phi' => '&#934;',
-    'lsaquo' => '&#8249;',
-    'Uacute' => '\xda',
-    'Omicron' => '&#927;',
-    'ang' => '&#8736;',
-    'ne' => '&#8800;',
-    'iquest' => '\xbf',
-    'eta' => '&#951;',
-    'yacute' => '\xfd',
-    'Rho' => '&#929;',
-    'uacute' => '\xfa',
-    'Alpha' => '&#913;',
-    'zeta' => '&#950;',
-    'Omega' => '&#937;',
-    'nu' => '&#957;',
-    'sim' => '&#8764;',
-    'sect' => '\xa7',
-    'phi' => '&#966;',
-    'sigmaf' => '&#962;',
-    'macr' => '\xaf',
-    'minus' => '&#8722;',
-    'Ccedil' => '\xc7',
-    'ordm' => '\xba',
-    'epsilon' => '&#949;',
-    'beta' => '&#946;',
-    'rArr' => '&#8658;',
-    'rho' => '&#961;',
-    'aacute' => '\xe1',
-    'eacute' => '\xe9',
-    'omega' => '&#969;',
-    'middot' => '\xb7',
-    'Gamma' => '&#915;',
-    'Iacute' => '\xcd',
-    'lang' => '&#9001;',
-    'spades' => '&#9824;',
-    'rsquo' => '&#8217;',
-    'uml' => '\xa8',
-    'thorn' => '\xfe',
-    'ouml' => '\xf6',
-    'thetasym' => '&#977;',
-    'or' => '&#8744;',
-    'raquo' => '\xbb',
-    'acirc' => '\xe2',
-    'ldquo' => '&#8220;',
-    'hearts' => '&#9829;',
-    'sigma' => '&#963;',
-    'oacute' => '\xf3',
-=end
-}
+class ::UrlLinkError < RuntimeError
+end
  
  class UrlPlugin < Plugin
+  TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
+  LINK_INFO = "[Link Info]"
+
    BotConfig.register BotConfigIntegerValue.new('url.max_urls',
      :default => 100, :validate => Proc.new{|v| v > 0},
      :desc => "Maximum number of urls to store. New urls replace oldest ones.")
    BotConfig.register BotConfigBooleanValue.new('url.display_link_info',
-    :default => false, 
+    :default => false,
      :desc => "Get the title of any links pasted to the channel and display it (also tells if the link is broken or the site is down)")
-  
+  BotConfig.register BotConfigBooleanValue.new('url.titles_only',
+    :default => false,
+    :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
+  BotConfig.register BotConfigBooleanValue.new('url.first_par',
+    :default => false,
+    :desc => "Also try to get the first paragraph of a web page")
+  BotConfig.register BotConfigBooleanValue.new('url.info_on_list',
+    :default => false,
+    :desc => "Show link info when listing/searching for urls")
+
+
    def initialize
      super
      @registry.set_default(Array.new)
@@ -287,113 +33,95 @@ class UrlPlugin < Plugin
      "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
    end
  
-  def unescape_title(htmldata)
-    # first pass -- let CGI try to attack it...
-    htmldata = CGI::unescapeHTML htmldata
-    
-    # second pass -- destroy the remaining bits...
-    htmldata.gsub(/(&(.+?);)/) {
-        symbol = $2
-        
-        # remove the 0-paddng from unicode integers
-        if symbol =~ /#(.+)/
-            symbol = "##{$1.to_i.to_s}"
-        end
-        
-        # output the symbol's irc-translated character, or a * if it's unknown
-        UNESCAPE_TABLE[symbol] || '*'
-    }
-  end
-
    def get_title_from_html(pagedata)
      return unless TITLE_RE.match(pagedata)
-    title = $1.strip.gsub(/\s*\n+\s*/, " ")
-    title = unescape_title title
-    title = title[0..255] if title.length > 255
-    "[Link Info] title: #{title}"
+    $1.ircify_html
    end
  
-  def read_data_from_response(response, amount)
-    
-    amount_read = 0
-    chunks = []
-    
-    response.read_body do |chunk|   # read body now
-      
-      amount_read += chunk.length
-      
-      if amount_read > amount
-        amount_of_overflow = amount_read - amount
-        chunk = chunk[0...-amount_of_overflow]
-      end
-      
-      chunks << chunk
-
-      break if amount_read >= amount
-      
-    end
-    
-    chunks.join('')
-    
-  end
+  def get_title_for_url(uri_str, nick = nil, channel = nil)
  
-
-  def get_title_for_url(uri_str, depth=10)
-    # This god-awful mess is what the ruby http library has reduced me to.
-    # Python's HTTP lib is so much nicer. :~(
-    
-    if depth == 0
-        raise "Error: Maximum redirects hit."
-    end
-    
-    debug "+ Getting #{uri_str}"
-    url = URI.parse(uri_str)
+    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
      return if url.scheme !~ /https?/
  
-    title = nil
-    
-    debug "+ connecting to #{url.host}:#{url.port}"
-    http = @bot.httputil.get_proxy(url)
-    http.start { |http|
-      url.path = '/' if url.path == ''
+    logopts = Hash.new
+    logopts[:nick] = nick if nick
+    logopts[:channel] = channel if channel
  
-      http.request_get(url.path, "User-Agent" => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)") { |response|
-        
-        case response
-          when Net::HTTPRedirection, Net::HTTPMovedPermanently then
-            # call self recursively if this is a redirect
-            redirect_to = response['location']  || './'
-            debug "+ redirect location: #{redirect_to.inspect}"
-            url = URI.join url.to_s, redirect_to
-            debug "+ whee, redirecting to #{url.to_s}!"
-            return get_title_for_url(url.to_s, depth-1)
-          when Net::HTTPSuccess then
-            if response['content-type'] =~ /^text\//
-              # since the content is 'text/*' and is small enough to
-              # be a webpage, retrieve the title from the page
-              debug "+ getting #{url.request_uri}"
-              data = read_data_from_response(response, 50000)
-              return get_title_from_html(data)
+    title = nil
+    extra = String.new
+
+    begin
+      debug "+ getting #{url.request_uri}"
+      @bot.httputil.get_response(url) { |resp|
+        case resp
+        when Net::HTTPSuccess
+
+          debug resp.to_hash
+
+          if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
+            # The page is text or HTML, so we can try finding a title and, if
+            # requested, the first par.
+            #
+            # We act differently depending on whether we want the first par or
+            # not: in the first case we download the initial part and the parse
+            # it; in the second case we only download as much as we need to find
+            # the title
+            #
+            if @bot.config['url.first_par']
+              partial = resp.partial_body(@bot.config['http.info_bytes'])
+              logopts[:title] = title = get_title_from_html(partial)
+              first_par = Utils.ircify_first_html_par(partial, :strip => title)
+              unless first_par.empty?
+                logopts[:extra] = first_par
+                extra << ", #{Bold}text#{Bold}: #{first_par}"
+              end
+              call_event(:url_added, url.to_s, logopts)
+              return "#{Bold}title#{Bold}: #{title}#{extra}" if title
              else
-              # content doesn't have title, just display info.
-              size = response['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2')
-              return "[Link Info] type: #{response['content-type']}#{size ? ", size: #{size} bytes" : ""}"
+              resp.partial_body(@bot.config['http.info_bytes']) { |part|
+                logopts[:title] = title = get_title_from_html(part)
+                call_event(:url_added, url.to_s, logopts)
+                return "#{Bold}title#{Bold}: #{title}" if title
+              }
              end
-          when Net::HTTPClientError then
-            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
-          when Net::HTTPServerError then
-            return "[Link Info] Error getting link (#{response.code} - #{response.message})"
+          # if nothing was found, provide more basic info, as for non-html pages
            else
-            return nil
-        end # end of "case response"
-          
-      } # end of request block
-    } # end of http start block
-
-    return title
-    
-  rescue SocketError => e
-    return "[Link Info] Error connecting to site (#{e.message})"
+            resp.no_cache = true
+          end
+
+          enc = resp['content-encoding']
+          logopts[:extra] = String.new
+          logopts[:extra] << "Content Type: #{resp['content-type']}"
+          if enc
+            logopts[:extra] << ", encoding: #{enc}"
+            extra << ", #{Bold}encoding#{Bold}: #{enc}"
+          end
+
+          unless @bot.config['url.titles_only']
+            # content doesn't have title, just display info.
+            size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
+            if size
+              logopts[:extra] << ", size: #{size} bytes"
+              size = ", #{Bold}size#{Bold}: #{size} bytes"
+            end
+            call_event(:url_added, url.to_s, logopts)
+            return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
+          end
+          call_event(:url_added, url.to_s, logopts)
+        else
+          raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+        end
+      }
+      return nil
+    rescue Exception => e
+      case e
+      when UrlLinkError
+        raise e
+      else
+        error e
+        raise "connecting to site/processing information (#{e.message})"
+      end
+    end
    end
  
    def listen(m)
@@ -405,21 +133,28 @@ class UrlPlugin < Plugin
          urlstr = $1
          list = @registry[m.target]
  
+        title = nil
          if @bot.config['url.display_link_info']
-          debug "Getting title for #{urlstr}..."
-          title = get_title_for_url urlstr
-          if title
-            m.reply title
-            debug "Title found!"
-          else
-            debug "Title not found!"
-          end        
+          Thread.start do
+            debug "Getting title for #{urlstr}..."
+            begin
+              title = get_title_for_url urlstr, m.source.nick, m.channel
+              if title
+                m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
+                debug "Title found!"
+              else
+                debug "Title not found!"
+              end
+            rescue => e
+              m.reply "Error #{e.message}"
+            end
+          end
          end
-    
+
          # check to see if this url is already listed
          return if list.find {|u| u.url == urlstr }
-        
-        url = Url.new(m.target, m.sourcenick, Time.new, urlstr)
+
+        url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
          debug "#{list.length} urls so far"
          if list.length > @bot.config['url.max_urls']
            list.pop
@@ -432,6 +167,31 @@ class UrlPlugin < Plugin
      end
    end
  
+  def reply_urls(opts={})
+    list = opts[:list]
+    max = opts[:max]
+    channel = opts[:channel]
+    m = opts[:msg]
+    return unless list and max and m
+    list[0..(max-1)].each do |url|
+      disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
+      if @bot.config['url.info_on_list']
+        title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
+        # If the url info was missing and we now have some, try to upgrade it
+        if channel and title and not url.info
+          ll = @registry[channel]
+          debug ll
+          if el = ll.find { |u| u.url == url.url }
+            el.info = title
+            @registry[channel] = ll
+          end
+        end
+        disp << " --> #{title}" if title
+      end
+      m.reply disp, :overlong => :truncate
+    end
+  end
+
    def urls(m, params)
      channel = params[:channel] ? params[:channel] : m.target
      max = params[:limit].to_i
@@ -441,9 +201,7 @@ class UrlPlugin < Plugin
      if list.empty?
        m.reply "no urls seen yet for channel #{channel}"
      else
-      list[0..(max-1)].each do |url|
-        m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
-      end
+      reply_urls :msg => m, :channel => channel, :list => list, :max => max
      end
    end
  
@@ -455,17 +213,17 @@ class UrlPlugin < Plugin
      max = 1 if max < 1
      regex = Regexp.new(string, Regexp::IGNORECASE)
      list = @registry[channel].find_all {|url|
-      regex.match(url.url) || regex.match(url.nick)
+      regex.match(url.url) || regex.match(url.nick) ||
+        (@bot.config['url.info_on_list'] && regex.match(url.info))
      }
      if list.empty?
        m.reply "no matches for channel #{channel}"
      else
-      list[0..(max-1)].each do |url|
-        m.reply "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
-      end
+      reply_urls :msg => m, :channel => channel, :list => list, :max => max
      end
    end
  end
+
  plugin = UrlPlugin.new
  plugin.map 'urls search :channel :limit :string', :action => 'search',
                            :defaults => {:limit => 4},