data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   # Record type for one remembered URL. The fields are, in order: the channel
   # it was seen in, the nick that posted it, the time it was seen, the URL
   # itself and the link info (may be nil) retrieved for it.
   # NOTE(review): define_structure comes from the host framework, presumably a
   # Struct-like helper -- confirm.
   define_structure(:Url, :channel, :nick, :time, :url, :info)
   7
   # Error raised by UrlPlugin#get_title_for_url when the remote server answers
   # with anything other than a successful HTTP response.
   class ::UrlLinkError < RuntimeError; end
  10
  # Plugin that keeps a per-channel list of the URLs seen in public messages
  # and can report information about them (page title, first paragraph,
  # content type, size).
  class UrlPlugin < Plugin
    # Prefix of the replies that display link information
    LINK_INFO = "[Link Info]"
    # Matches every character that is neither URI-unreserved, URI-reserved,
    # '%', '#' nor a space; used with URI.escape before extracting URLs.
    OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')

    Config.register Config::IntegerValue.new('url.max_urls',
      :default => 100, :validate => Proc.new{|v| v > 0},
      :desc => "Maximum number of urls to store. New urls replace oldest ones.")
    Config.register Config::IntegerValue.new('url.display_link_info',
      :default => 0,
      :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
    Config.register Config::BooleanValue.new('url.titles_only',
      :default => false,
      :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
    Config.register Config::BooleanValue.new('url.first_par',
      :default => false,
      :desc => "Also try to get the first paragraph of a web page")
    Config.register Config::BooleanValue.new('url.info_on_list',
      :default => false,
      :desc => "Show link info when listing/searching for urls")
    Config.register Config::ArrayValue.new('url.no_info_hosts',
      :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
      :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
      :desc => "A list of regular expressions matching hosts for which no info should be provided")


    # Set up the registry default (an empty Array per key), make sure
    # url.display_link_info holds an Integer and build the no-info-hosts regexp.
    def initialize
      super
      @registry.set_default(Array.new)
      # If the stored value is not an Integer, feed its string form back
      # through the config item so that it gets re-parsed.
      unless @bot.config['url.display_link_info'].kind_of?(Integer)
        @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
      end
      reset_no_info_hosts
    end

    # Rebuild @no_info_hosts, the case-insensitive regexp matching the hosts
    # for which no info must be retrieved, from the url.no_info_hosts setting.
    #
    # Empty patterns are dropped: joining them with '|' would produce an
    # alternative that matches every host. When no pattern is left, a regexp
    # that never matches is used, so that an empty list disables nothing.
    def reset_no_info_hosts
      patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
      if patterns.empty?
        @no_info_hosts = /(?!)/
      else
        @no_info_hosts = Regexp.new(patterns.join('|'), true)
      end
      debug "no info hosts regexp set to #{@no_info_hosts}"
    end

    # Usage text for the plugin; +plugin+ and +topic+ are not used.
    def help(plugin, topic="")
      "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
    end

    # Extract the title from (a chunk of) HTML; delegates to
    # ircify_html_title on +pagedata+.
    def get_title_from_html(pagedata)
      return pagedata.ircify_html_title
    end

    # Retrieve a one-line description of the resource at +uri_str+ (a String
    # or an URI). +opts+ is duplicated, enriched with :title and/or :extra
    # and passed along with the :url_added event.
    #
    # Returns
    # * nil when the scheme is not http/https, or when nothing is to be
    #   displayed (no title found and url.titles_only is set)
    # * a "Sorry, ..." message when the host matches url.no_info_hosts
    # * "title: ..." (optionally followed by the first paragraph) for text
    #   and (X)HTML pages with a title
    # * "type: ..." with size and encoding, when known, otherwise
    #
    # Raises UrlLinkError when the response is not a success; any other
    # failure is logged and re-raised as a RuntimeError whose message starts
    # with "connecting to site/processing information".
    def get_title_for_url(uri_str, opts = {})

      url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
      # Anchored, so that only the http and https schemes are accepted
      return if url.scheme !~ /\Ahttps?\z/i

      if url.host =~ @no_info_hosts
        return "Sorry, info retrieval for #{url.host} is disabled"
      end

      logopts = opts.dup

      title = nil
      extra = String.new

      begin
        debug "+ getting #{url.request_uri}"
        @bot.httputil.get_response(url) { |resp|
          case resp
          when Net::HTTPSuccess

            debug resp.to_hash

            if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
              # The page is text or HTML, so we can try finding a title and, if
              # requested, the first par.
              #
              # We act differently depending on whether we want the first par or
              # not: in the first case we download the initial part and then
              # parse it; in the second case we only download as much as we need
              # to find the title
              #
              if @bot.config['url.first_par']
                partial = resp.partial_body(@bot.config['http.info_bytes'])
                logopts[:title] = title = get_title_from_html(partial)
                if url.fragment and not url.fragment.empty?
                  # Drop everything up to the anchor named by the fragment. The
                  # fragment is escaped because it is arbitrary text, not a
                  # regular expression.
                  fragreg = /.*?<a\s+[^>]*name=["']?#{Regexp.escape(url.fragment)}["']?.*?>/im
                  partial.sub!(fragreg,'')
                end
                first_par = Utils.ircify_first_html_par(partial, :strip => title)
                unless first_par.empty?
                  logopts[:extra] = first_par
                  extra << ", #{Bold}text#{Bold}: #{first_par}"
                end
                call_event(:url_added, url.to_s, logopts)
                return "#{Bold}title#{Bold}: #{title}#{extra}" if title
              else
                resp.partial_body(@bot.config['http.info_bytes']) { |part|
                  logopts[:title] = title = get_title_from_html(part)
                  call_event(:url_added, url.to_s, logopts)
                  return "#{Bold}title#{Bold}: #{title}" if title
                }
              end
            # if nothing was found, provide more basic info, as for non-html pages
            else
              resp.no_cache = true
            end

            enc = resp['content-encoding']
            logopts[:extra] = String.new
            logopts[:extra] << "Content Type: #{resp['content-type']}"
            if enc
              logopts[:extra] << ", encoding: #{enc}"
              extra << ", #{Bold}encoding#{Bold}: #{enc}"
            end

            unless @bot.config['url.titles_only']
              # content doesn't have title, just display info.
              length = resp['content-length']
              # Insert thousands separators; the header may be missing
              size = length ? length.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') : nil
              if size
                logopts[:extra] << ", size: #{size} bytes"
                size = ", #{Bold}size#{Bold}: #{size} bytes"
              end
              call_event(:url_added, url.to_s, logopts)
              return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
            end
            call_event(:url_added, url.to_s, logopts)
          else
            raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
          end
        }
        return nil
      rescue UrlLinkError
        # Already carries a user-presentable message: pass it on untouched
        raise
      rescue Exception => e
        # NOTE(review): Exception rather than StandardError is rescued here,
        # presumably so that timeouts are reported too on interpreters where
        # they are not StandardErrors -- confirm before narrowing.
        error e
        raise "connecting to site/processing information (#{e.message})"
      end
    end

    # Process the +urls+ (Array of Strings) found in message +m+: fetch the
    # info of each http(s) url, reply with it for at most +display_info+ urls
    # and, for public messages, remember the url in the list of the channel.
    #
    # Failures to retrieve the info are reported to the user under the same
    # limit as titles, so nothing is sent when +display_info+ is 0.
    def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
      return if urls.empty?
      debug "found urls #{urls.inspect}"
      # Urls are only stored for public messages
      list = m.public? ? @registry[m.target] : nil
      debug "display link info: #{display_info}"
      max_urls = @bot.config['url.max_urls']
      urls_displayed = 0
      urls.each do |urlstr|
        debug "working on #{urlstr}"
        next unless urlstr =~ /^https?:/
        title = nil
        failure = nil
        debug "Getting title for #{urlstr}..."
        begin
          title = get_title_for_url(urlstr,
                                    :nick => m.source.nick,
                                    :channel => m.channel,
                                    :ircline => m.message)
          debug "Title #{title ? '' : 'not '} found"
        rescue => e
          debug "Error getting title for #{urlstr}: #{e.message}"
          failure = "Error #{e.message}"
        end

        if display_info > urls_displayed
          if title
            m.reply("#{LINK_INFO} #{title}", :overlong => :truncate)
            urls_displayed += 1
          elsif failure
            m.reply failure
            urls_displayed += 1
          end
        end

        next unless list

        # check to see if this url is already listed
        next if list.find {|u| u.url == urlstr }

        url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
        debug "#{list.length} urls so far"
        debug "storing url #{url.url}"
        list.unshift url
        # Newest urls come first: drop whatever exceeds the configured maximum
        list.slice!(max_urls..-1) if list.length > max_urls
        debug "#{list.length} urls now"
      end
      # Nothing to store for private messages (list is nil)
      @registry[m.target] = list if list
    end

    # Action for the "url(s) info" commands: display info for each of the
    # urls given in params[:urls]. The work is done in a separate thread.
    def info(m, params)
      escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
      urls = URI.extract(escaped)
      Thread.new { handle_urls(m, urls, params[:urls].length) }
    end

    # Look for urls in every PrivMessage that is not addressed to the bot and
    # process them in a separate thread.
    def listen(m)
      return unless m.kind_of?(PrivMessage)
      return if m.address?

      escaped = URI.escape(m.message, OUR_UNSAFE)
      urls = URI.extract(escaped)
      Thread.new { handle_urls(m, urls) }
    end

    # Reply with the first entries of a list of Url records.
    #
    # Options:
    # :list::    the Url records to display (required)
    # :max::     how many of them to display at most (required)
    # :msg::     the message to reply to (required)
    # :channel:: channel the list belongs to; when given, info retrieved
    #            while listing is stored back into the registry
    #
    # Does nothing if any of the required options is missing.
    def reply_urls(opts={})
      list = opts[:list]
      max = opts[:max]
      channel = opts[:channel]
      m = opts[:msg]
      return unless list and max and m
      show_info = @bot.config['url.info_on_list']
      list[0..(max-1)].each do |url|
        disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
        if show_info
          # Use the stored info if any, else try to fetch it now; failures
          # simply result in no info being displayed.
          title = begin
            url.info ||
              get_title_for_url(url.url,
                                :nick => url.nick, :channel => channel)
          rescue
            nil
          end
          # If the url info was missing and we now have some, try to upgrade it
          if channel and title and not url.info
            ll = @registry[channel]
            debug ll
            if el = ll.find { |u| u.url == url.url }
              el.info = title
              @registry[channel] = ll
            end
          end
          disp << " --> #{title}" if title
        end
        m.reply disp, :overlong => :truncate
      end
    end

    # Action for the "urls" command: list the latest urls seen in
    # params[:channel], or in the target of +m+ when no channel is given.
    def urls(m, params)
      channel = params[:channel] || m.target
      max = listing_limit(params)
      # Tolerate registry entries that hold nil instead of a list
      list = @registry[channel] || []
      if list.empty?
        m.reply "no urls seen yet for channel #{channel}"
      else
        reply_urls :msg => m, :channel => channel, :list => list, :max => max
      end
    end

    # Action for the "urls search" command: list the stored urls whose url or
    # nick (and info too, if url.info_on_list is set) match the regular
    # expression in params[:string], case-insensitively.
    def search(m, params)
      channel = params[:channel] || m.target
      max = listing_limit(params)
      begin
        regex = Regexp.new(params[:string], Regexp::IGNORECASE)
      rescue RegexpError => e
        # The pattern comes straight from the user: report it, don't blow up
        m.reply "invalid regular expression: #{e.message}"
        return
      end
      search_info = @bot.config['url.info_on_list']
      list = (@registry[channel] || []).find_all {|url|
        regex.match(url.url) || regex.match(url.nick) ||
          (search_info && url.info && regex.match(url.info))
      }
      if list.empty?
        m.reply "no matches for channel #{channel}"
      else
        reply_urls :msg => m, :channel => channel, :list => list, :max => max
      end
    end

    private

    # Number of urls to list, taken from params[:limit] and forced into the
    # 1..10 range.
    def listing_limit(params)
      [[params[:limit].to_i, 10].min, 1].max
    end
  end
 266
 # Instantiate the plugin and register its command routes.
 plugin = UrlPlugin.new

 # Explicit info requests, with either spelling of the command
 ['urls', 'url'].each do |command|
   plugin.map "#{command} info *urls", :action => 'info'
 end

 # Searching: the channel must be given in private and is implied in public
 plugin.map('urls search :channel :limit :string',
            :action => 'search',
            :defaults => {:limit => 4},
            :requirements => {:limit => /^\d+$/},
            :public => false)
 plugin.map('urls search :limit :string',
            :action => 'search',
            :defaults => {:limit => 4},
            :requirements => {:limit => /^\d+$/},
            :private => false)

 # Listing: same split between the private and the public form
 plugin.map('urls :channel :limit',
            :defaults => {:limit => 4},
            :requirements => {:limit => /^\d+$/},
            :public => false)
 plugin.map('urls :limit',
            :defaults => {:limit => 4},
            :requirements => {:limit => /^\d+$/},
            :private => false)