data/rbot/plugins/url.rb

   1 define_structure :Url, :channel, :nick, :time, :url, :info
   2
   3 class ::UrlLinkError < RuntimeError
   4 end
   5
   6 class UrlPlugin < Plugin
   7   TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
   8   LINK_INFO = "[Link Info]"
   9   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  10
  11   BotConfig.register BotConfigIntegerValue.new('url.max_urls',
  12     :default => 100, :validate => Proc.new{|v| v > 0},
  13     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  14   BotConfig.register BotConfigIntegerValue.new('url.display_link_info',
  15     :default => 0,
  16     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  17   BotConfig.register BotConfigBooleanValue.new('url.titles_only',
  18     :default => false,
  19     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  20   BotConfig.register BotConfigBooleanValue.new('url.first_par',
  21     :default => false,
  22     :desc => "Also try to get the first paragraph of a web page")
  23   BotConfig.register BotConfigBooleanValue.new('url.info_on_list',
  24     :default => false,
  25     :desc => "Show link info when listing/searching for urls")
  26
  27
  28   def initialize
  29     super
  30     @registry.set_default(Array.new)
  31     unless @bot.config['url.display_link_info'].kind_of?(Integer)
  32       @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
  33     end
  34   end
  35
  36   def help(plugin, topic="")
  37     "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  38   end
  39
  40   def get_title_from_html(pagedata)
  41     return unless TITLE_RE.match(pagedata)
  42     $1.ircify_html
  43   end
  44
  45   def get_title_for_url(uri_str, nick = nil, channel = nil)
  46
  47     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
  48     return if url.scheme !~ /https?/
  49
  50     logopts = Hash.new
  51     logopts[:nick] = nick if nick
  52     logopts[:channel] = channel if channel
  53
  54     title = nil
  55     extra = String.new
  56
  57     begin
  58       debug "+ getting #{url.request_uri}"
  59       @bot.httputil.get_response(url) { |resp|
  60         case resp
  61         when Net::HTTPSuccess
  62
  63           debug resp.to_hash
  64
  65           if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
  66             # The page is text or HTML, so we can try finding a title and, if
  67             # requested, the first par.
  68             #
  69             # We act differently depending on whether we want the first par or
  70             # not: in the first case we download the initial part and the parse
  71             # it; in the second case we only download as much as we need to find
  72             # the title
  73             #
  74             if @bot.config['url.first_par']
  75               partial = resp.partial_body(@bot.config['http.info_bytes'])
  76               logopts[:title] = title = get_title_from_html(partial)
  77               if url.fragment and not url.fragment.empty?
  78                 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
  79                 partial.sub!(fragreg,'')
  80               end
  81               first_par = Utils.ircify_first_html_par(partial, :strip => title)
  82               unless first_par.empty?
  83                 logopts[:extra] = first_par
  84                 extra << ", #{Bold}text#{Bold}: #{first_par}"
  85               end
  86               call_event(:url_added, url.to_s, logopts)
  87               return "#{Bold}title#{Bold}: #{title}#{extra}" if title
  88             else
  89               resp.partial_body(@bot.config['http.info_bytes']) { |part|
  90                 logopts[:title] = title = get_title_from_html(part)
  91                 call_event(:url_added, url.to_s, logopts)
  92                 return "#{Bold}title#{Bold}: #{title}" if title
  93               }
  94             end
  95           # if nothing was found, provide more basic info, as for non-html pages
  96           else
  97             resp.no_cache = true
  98           end
  99
 100           enc = resp['content-encoding']
 101           logopts[:extra] = String.new
 102           logopts[:extra] << "Content Type: #{resp['content-type']}"
 103           if enc
 104             logopts[:extra] << ", encoding: #{enc}"
 105             extra << ", #{Bold}encoding#{Bold}: #{enc}"
 106           end
 107
 108           unless @bot.config['url.titles_only']
 109             # content doesn't have title, just display info.
 110             size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
 111             if size
 112               logopts[:extra] << ", size: #{size} bytes"
 113               size = ", #{Bold}size#{Bold}: #{size} bytes"
 114             end
 115             call_event(:url_added, url.to_s, logopts)
 116             return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
 117           end
 118           call_event(:url_added, url.to_s, logopts)
 119         else
 120           raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
 121         end
 122       }
 123       return nil
 124     rescue Exception => e
 125       case e
 126       when UrlLinkError
 127         raise e
 128       else
 129         error e
 130         raise "connecting to site/processing information (#{e.message})"
 131       end
 132     end
 133   end
 134
 135   def listen(m)
 136     return unless m.kind_of?(PrivMessage)
 137     return if m.address?
 138
 139     escaped = URI.escape(m.message, OUR_UNSAFE)
 140     urls = URI.extract(escaped)
 141     return if urls.empty?
 142     debug "found urls #{urls.inspect}"
 143     list = @registry[m.target]
 144     urls_displayed = 0
 145     urls.each { |urlstr|
 146       debug "working on #{urlstr}"
 147       next unless urlstr =~ /^https?:/
 148       title = nil
 149       debug "display link info: #{@bot.config['url.display_link_info']}"
 150       if @bot.config['url.display_link_info'] > urls_displayed
 151         urls_displayed += 1
 152         Thread.start do
 153           debug "Getting title for #{urlstr}..."
 154           begin
 155             title = get_title_for_url urlstr, m.source.nick, m.channel
 156             if title
 157               m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
 158               debug "Title found!"
 159             else
 160               debug "Title not found!"
 161             end
 162           rescue => e
 163             m.reply "Error #{e.message}"
 164           end
 165         end
 166       end
 167
 168       # check to see if this url is already listed
 169       next if list.find {|u| u.url == urlstr }
 170
 171       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 172       debug "#{list.length} urls so far"
 173       if list.length > @bot.config['url.max_urls']
 174         list.pop
 175       end
 176       debug "storing url #{url.url}"
 177       list.unshift url
 178       debug "#{list.length} urls now"
 179     }
 180     @registry[m.target] = list
 181   end
 182
 183   def reply_urls(opts={})
 184     list = opts[:list]
 185     max = opts[:max]
 186     channel = opts[:channel]
 187     m = opts[:msg]
 188     return unless list and max and m
 189     list[0..(max-1)].each do |url|
 190       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 191       if @bot.config['url.info_on_list']
 192         title = url.info || get_title_for_url(url.url, url.nick, channel) rescue nil
 193         # If the url info was missing and we now have some, try to upgrade it
 194         if channel and title and not url.info
 195           ll = @registry[channel]
 196           debug ll
 197           if el = ll.find { |u| u.url == url.url }
 198             el.info = title
 199             @registry[channel] = ll
 200           end
 201         end
 202         disp << " --> #{title}" if title
 203       end
 204       m.reply disp, :overlong => :truncate
 205     end
 206   end
 207
 208   def urls(m, params)
 209     channel = params[:channel] ? params[:channel] : m.target
 210     max = params[:limit].to_i
 211     max = 10 if max > 10
 212     max = 1 if max < 1
 213     list = @registry[channel]
 214     if list.empty?
 215       m.reply "no urls seen yet for channel #{channel}"
 216     else
 217       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 218     end
 219   end
 220
 221   def search(m, params)
 222     channel = params[:channel] ? params[:channel] : m.target
 223     max = params[:limit].to_i
 224     string = params[:string]
 225     max = 10 if max > 10
 226     max = 1 if max < 1
 227     regex = Regexp.new(string, Regexp::IGNORECASE)
 228     list = @registry[channel].find_all {|url|
 229       regex.match(url.url) || regex.match(url.nick) ||
 230         (@bot.config['url.info_on_list'] && regex.match(url.info))
 231     }
 232     if list.empty?
 233       m.reply "no matches for channel #{channel}"
 234     else
 235       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 236     end
 237   end
 238 end
 239
 240 plugin = UrlPlugin.new
 241 plugin.map 'urls search :channel :limit :string', :action => 'search',
 242                           :defaults => {:limit => 4},
 243                           :requirements => {:limit => /^\d+$/},
 244                           :public => false
 245 plugin.map 'urls search :limit :string', :action => 'search',
 246                           :defaults => {:limit => 4},
 247                           :requirements => {:limit => /^\d+$/},
 248                           :private => false
 249 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 250                           :requirements => {:limit => /^\d+$/},
 251                           :public => false
 252 plugin.map 'urls :limit', :defaults => {:limit => 4},
 253                           :requirements => {:limit => /^\d+$/},
 254                           :private => false