data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class ::UrlLinkError < RuntimeError
   9 end
  10
  11 class UrlPlugin < Plugin
  12   TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
  13   LINK_INFO = "[Link Info]"
  14   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  15
  16   Config.register Config::IntegerValue.new('url.max_urls',
  17     :default => 100, :validate => Proc.new{|v| v > 0},
  18     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  19   Config.register Config::IntegerValue.new('url.display_link_info',
  20     :default => 0,
  21     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  22   Config.register Config::BooleanValue.new('url.titles_only',
  23     :default => false,
  24     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  25   Config.register Config::BooleanValue.new('url.first_par',
  26     :default => false,
  27     :desc => "Also try to get the first paragraph of a web page")
  28   Config.register Config::BooleanValue.new('url.info_on_list',
  29     :default => false,
  30     :desc => "Show link info when listing/searching for urls")
  31   Config.register Config::ArrayValue.new('url.no_info_hosts',
  32     :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
  33     :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
  34     :desc => "A list of regular expressions matching hosts for which no info should be provided")
  35
  36
  37   def initialize
  38     super
  39     @registry.set_default(Array.new)
  40     unless @bot.config['url.display_link_info'].kind_of?(Integer)
  41       @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
  42     end
  43     reset_no_info_hosts
  44   end
  45
  46   def reset_no_info_hosts
  47     @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
  48     debug "no info hosts regexp set to #{@no_info_hosts}"
  49   end
  50
  51   def help(plugin, topic="")
  52     "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  53   end
  54
  55   def get_title_from_html(pagedata)
  56     return unless TITLE_RE.match(pagedata)
  57     $1.ircify_html
  58   end
  59
  60   def get_title_for_url(uri_str, opts = {})
  61
  62     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
  63     return if url.scheme !~ /https?/
  64
  65     if url.host =~ @no_info_hosts
  66       return "Sorry, info retrieval for #{url.host} is disabled"
  67     end
  68
  69     logopts = opts.dup
  70
  71     title = nil
  72     extra = String.new
  73
  74     begin
  75       debug "+ getting #{url.request_uri}"
  76       @bot.httputil.get_response(url) { |resp|
  77         case resp
  78         when Net::HTTPSuccess
  79
  80           debug resp.to_hash
  81
  82           if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
  83             # The page is text or HTML, so we can try finding a title and, if
  84             # requested, the first par.
  85             #
  86             # We act differently depending on whether we want the first par or
  87             # not: in the first case we download the initial part and the parse
  88             # it; in the second case we only download as much as we need to find
  89             # the title
  90             #
  91             if @bot.config['url.first_par']
  92               partial = resp.partial_body(@bot.config['http.info_bytes'])
  93               logopts[:title] = title = get_title_from_html(partial)
  94               if url.fragment and not url.fragment.empty?
  95                 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
  96                 partial.sub!(fragreg,'')
  97               end
  98               first_par = Utils.ircify_first_html_par(partial, :strip => title)
  99               unless first_par.empty?
 100                 logopts[:extra] = first_par
 101                 extra << ", #{Bold}text#{Bold}: #{first_par}"
 102               end
 103               call_event(:url_added, url.to_s, logopts)
 104               return "#{Bold}title#{Bold}: #{title}#{extra}" if title
 105             else
 106               resp.partial_body(@bot.config['http.info_bytes']) { |part|
 107                 logopts[:title] = title = get_title_from_html(part)
 108                 call_event(:url_added, url.to_s, logopts)
 109                 return "#{Bold}title#{Bold}: #{title}" if title
 110               }
 111             end
 112           # if nothing was found, provide more basic info, as for non-html pages
 113           else
 114             resp.no_cache = true
 115           end
 116
 117           enc = resp['content-encoding']
 118           logopts[:extra] = String.new
 119           logopts[:extra] << "Content Type: #{resp['content-type']}"
 120           if enc
 121             logopts[:extra] << ", encoding: #{enc}"
 122             extra << ", #{Bold}encoding#{Bold}: #{enc}"
 123           end
 124
 125           unless @bot.config['url.titles_only']
 126             # content doesn't have title, just display info.
 127             size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
 128             if size
 129               logopts[:extra] << ", size: #{size} bytes"
 130               size = ", #{Bold}size#{Bold}: #{size} bytes"
 131             end
 132             call_event(:url_added, url.to_s, logopts)
 133             return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
 134           end
 135           call_event(:url_added, url.to_s, logopts)
 136         else
 137           raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
 138         end
 139       }
 140       return nil
 141     rescue Exception => e
 142       case e
 143       when UrlLinkError
 144         raise e
 145       else
 146         error e
 147         raise "connecting to site/processing information (#{e.message})"
 148       end
 149     end
 150   end
 151
 152   def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
 153     return if urls.empty?
 154     debug "found urls #{urls.inspect}"
 155     if m.public?
 156       list = @registry[m.target]
 157     else
 158       list = nil
 159     end
 160     urls_displayed = 0
 161     urls.each { |urlstr|
 162       debug "working on #{urlstr}"
 163       next unless urlstr =~ /^https?:/
 164       title = nil
 165       debug "display link info: #{display_info}"
 166       if display_info > urls_displayed
 167         urls_displayed += 1
 168         Thread.start do
 169           debug "Getting title for #{urlstr}..."
 170           begin
 171             title = get_title_for_url(urlstr,
 172                                       :nick => m.source.nick,
 173                                       :channel => m.channel,
 174                                       :ircline => m.message)
 175             if title
 176               m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
 177               debug "Title found!"
 178             else
 179               debug "Title not found!"
 180             end
 181           rescue => e
 182             m.reply "Error #{e.message}"
 183           end
 184         end
 185       end
 186
 187       next unless list
 188
 189       # check to see if this url is already listed
 190       next if list.find {|u| u.url == urlstr }
 191
 192       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 193       debug "#{list.length} urls so far"
 194       if list.length > @bot.config['url.max_urls']
 195         list.pop
 196       end
 197       debug "storing url #{url.url}"
 198       list.unshift url
 199       debug "#{list.length} urls now"
 200     }
 201     @registry[m.target] = list
 202   end
 203
 204   def info(m, params)
 205     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 206     urls = URI.extract(escaped)
 207     handle_urls(m, urls, params[:urls].length)
 208   end
 209
 210   def listen(m)
 211     return unless m.kind_of?(PrivMessage)
 212     return if m.address?
 213
 214     escaped = URI.escape(m.message, OUR_UNSAFE)
 215     urls = URI.extract(escaped)
 216     handle_urls(m, urls)
 217   end
 218
 219   def reply_urls(opts={})
 220     list = opts[:list]
 221     max = opts[:max]
 222     channel = opts[:channel]
 223     m = opts[:msg]
 224     return unless list and max and m
 225     list[0..(max-1)].each do |url|
 226       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 227       if @bot.config['url.info_on_list']
 228         title = url.info ||
 229           get_title_for_url(url.url,
 230                             :nick => url.nick, :channel => channel) rescue nil
 231         # If the url info was missing and we now have some, try to upgrade it
 232         if channel and title and not url.info
 233           ll = @registry[channel]
 234           debug ll
 235           if el = ll.find { |u| u.url == url.url }
 236             el.info = title
 237             @registry[channel] = ll
 238           end
 239         end
 240         disp << " --> #{title}" if title
 241       end
 242       m.reply disp, :overlong => :truncate
 243     end
 244   end
 245
 246   def urls(m, params)
 247     channel = params[:channel] ? params[:channel] : m.target
 248     max = params[:limit].to_i
 249     max = 10 if max > 10
 250     max = 1 if max < 1
 251     list = @registry[channel]
 252     if list.empty?
 253       m.reply "no urls seen yet for channel #{channel}"
 254     else
 255       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 256     end
 257   end
 258
 259   def search(m, params)
 260     channel = params[:channel] ? params[:channel] : m.target
 261     max = params[:limit].to_i
 262     string = params[:string]
 263     max = 10 if max > 10
 264     max = 1 if max < 1
 265     regex = Regexp.new(string, Regexp::IGNORECASE)
 266     list = @registry[channel].find_all {|url|
 267       regex.match(url.url) || regex.match(url.nick) ||
 268         (@bot.config['url.info_on_list'] && regex.match(url.info))
 269     }
 270     if list.empty?
 271       m.reply "no matches for channel #{channel}"
 272     else
 273       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 274     end
 275   end
 276 end
 277
 278 plugin = UrlPlugin.new
 279 plugin.map 'urls info *urls', :action => 'info'
 280 plugin.map 'url info *urls', :action => 'info'
 281 plugin.map 'urls search :channel :limit :string', :action => 'search',
 282                           :defaults => {:limit => 4},
 283                           :requirements => {:limit => /^\d+$/},
 284                           :public => false
 285 plugin.map 'urls search :limit :string', :action => 'search',
 286                           :defaults => {:limit => 4},
 287                           :requirements => {:limit => /^\d+$/},
 288                           :private => false
 289 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 290                           :requirements => {:limit => /^\d+$/},
 291                           :public => false
 292 plugin.map 'urls :limit', :defaults => {:limit => 4},
 293                           :requirements => {:limit => /^\d+$/},
 294                           :private => false