data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class UrlPlugin < Plugin
   9   LINK_INFO = "[Link Info]"
  10   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  11
  12   Config.register Config::IntegerValue.new('url.max_urls',
  13     :default => 100, :validate => Proc.new{|v| v > 0},
  14     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  15   Config.register Config::IntegerValue.new('url.display_link_info',
  16     :default => 0,
  17     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  18   Config.register Config::BooleanValue.new('url.titles_only',
  19     :default => false,
  20     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  21   Config.register Config::BooleanValue.new('url.first_par',
  22     :default => false,
  23     :desc => "Also try to get the first paragraph of a web page")
  24   Config.register Config::BooleanValue.new('url.info_on_list',
  25     :default => false,
  26     :desc => "Show link info when listing/searching for urls")
  27   Config.register Config::ArrayValue.new('url.no_info_hosts',
  28     :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
  29     :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
  30     :desc => "A list of regular expressions matching hosts for which no info should be provided")
  31   Config.register Config::ArrayValue.new('url.only_on_channels',
  32     :desc => "Show link info only on these channels",
  33     :default => [])
  34   Config.register Config::ArrayValue.new('url.ignore',
  35     :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
  36     :default => [])
  37
  38   def initialize
  39     super
  40     @registry.set_default(Array.new)
  41     unless @bot.config['url.display_link_info'].kind_of?(Integer)
  42       @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
  43     end
  44     reset_no_info_hosts
  45     self.filter_group = :htmlinfo
  46     load_filters
  47   end
  48
  49   def reset_no_info_hosts
  50     @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
  51     debug "no info hosts regexp set to #{@no_info_hosts}"
  52   end
  53
  54   def help(plugin, topic="")
  55     "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  56   end
  57
  58   def get_title_from_html(pagedata)
  59     return pagedata.ircify_html_title
  60   end
  61
  62   def get_title_for_url(uri_str, opts = {})
  63
  64     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
  65     return if url.scheme !~ /https?/
  66
  67     # also check the ip, the canonical name and the aliases
  68     begin
  69       checks = TCPSocket.gethostbyname(url.host)
  70       checks.delete_at(-2)
  71     rescue => e
  72       return "Unable to retrieve info for #{url.host}: #{e.message}"
  73     end
  74
  75     checks << url.host
  76     checks.flatten!
  77
  78     unless checks.grep(@no_info_hosts).empty?
  79       return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
  80     end
  81
  82     logopts = opts.dup
  83
  84     title = nil
  85     extra = []
  86
  87     begin
  88       debug "+ getting info for #{url.request_uri}"
  89       info = @bot.filter(:htmlinfo, url)
  90       debug info
  91       resp = info[:headers]
  92
  93       logopts[:title] = title = info[:title]
  94
  95       if info[:content]
  96         logopts[:extra] = info[:content]
  97         extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
  98       else
  99         logopts[:extra] = String.new
 100         logopts[:extra] << "Content Type: #{resp['content-type']}"
 101         extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
 102         if enc = resp['content-encoding']
 103           logopts[:extra] << ", encoding: #{enc}"
 104           extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
 105         end
 106
 107         size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
 108         if size
 109           logopts[:extra] << ", size: #{size} bytes"
 110           extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
 111         end
 112       end
 113     rescue Exception => e
 114       case e
 115       when UrlLinkError
 116         raise e
 117       else
 118         error e
 119         raise "connecting to site/processing information (#{e.message})"
 120       end
 121     end
 122
 123     call_event(:url_added, url.to_s, logopts)
 124     if title
 125       extra.unshift("#{Bold}title#{Bold}: #{title}")
 126     end
 127     return extra.join(", ") if title or not @bot.config['url.titles_only']
 128   end
 129
 130   def handle_urls(m, params={})
 131     opts = {
 132       :display_info => @bot.config['url.display_link_info'],
 133       :channels => @bot.config['url.only_on_channels'],
 134       :ignore => @bot.config['url.ignore']
 135     }.merge params
 136     urls = opts[:urls]
 137     display_info= opts[:display_info]
 138     channels = opts[:channels]
 139     ignore = opts[:ignore]
 140
 141     unless channels.empty?
 142       return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
 143     end
 144
 145     ignore.each { |u| return if m.source.matches?(u) }
 146
 147     return if urls.empty?
 148     debug "found urls #{urls.inspect}"
 149     list = m.public? ? @registry[m.target] : nil
 150     debug "display link info: #{display_info}"
 151     urls_displayed = 0
 152     urls.each do |urlstr|
 153       debug "working on #{urlstr}"
 154       next unless urlstr =~ /^https?:\/\/./
 155       title = nil
 156       debug "Getting title for #{urlstr}..."
 157       reply = nil
 158       begin
 159         title = get_title_for_url(urlstr,
 160                                   :always_reply => m.address?,
 161                                   :nick => m.source.nick,
 162                                   :channel => m.channel,
 163                                   :ircline => m.message)
 164         debug "Title #{title ? '' : 'not '} found"
 165         reply = "#{LINK_INFO} #{title}" if title
 166       rescue => e
 167         debug e
 168         # we might get a 404 because of trailing punctuation, so we try again
 169         # with the last character stripped. this might generate invalid URIs
 170         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 171         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 172           # chop off last character, and retry if we still have enough string to
 173           # look like a minimal URL
 174           retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
 175         end
 176         reply = "Error #{e.message}"
 177       end
 178
 179       if display_info > urls_displayed
 180         if reply
 181           m.reply reply, :overlong => :truncate, :to => :public,
 182             :nick => (m.address? ? :auto : false)
 183           urls_displayed += 1
 184         end
 185       end
 186
 187       next unless list
 188
 189       # check to see if this url is already listed
 190       next if list.find {|u| u.url == urlstr }
 191
 192       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 193       debug "#{list.length} urls so far"
 194       list.pop if list.length > @bot.config['url.max_urls']
 195       debug "storing url #{url.url}"
 196       list.unshift url
 197       debug "#{list.length} urls now"
 198     end
 199     @registry[m.target] = list
 200   end
 201
 202   def info(m, params)
 203     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 204     urls = URI.extract(escaped)
 205     Thread.new do
 206       handle_urls(m,
 207                   :urls => urls,
 208                   :display_info => params[:urls].length,
 209                   :channels => [])
 210     end
 211   end
 212
 213   def message(m)
 214     return if m.address?
 215
 216     escaped = URI.escape(m.message, OUR_UNSAFE)
 217     urls = URI.extract(escaped, ['http', 'https'])
 218     return if urls.empty?
 219     Thread.new { handle_urls(m, :urls => urls) }
 220   end
 221
 222   def reply_urls(opts={})
 223     list = opts[:list]
 224     max = opts[:max]
 225     channel = opts[:channel]
 226     m = opts[:msg]
 227     return unless list and max and m
 228     list[0..(max-1)].each do |url|
 229       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 230       if @bot.config['url.info_on_list']
 231         title = url.info ||
 232           get_title_for_url(url.url,
 233                             :nick => url.nick, :channel => channel) rescue nil
 234         # If the url info was missing and we now have some, try to upgrade it
 235         if channel and title and not url.info
 236           ll = @registry[channel]
 237           debug ll
 238           if el = ll.find { |u| u.url == url.url }
 239             el.info = title
 240             @registry[channel] = ll
 241           end
 242         end
 243         disp << " --> #{title}" if title
 244       end
 245       m.reply disp, :overlong => :truncate
 246     end
 247   end
 248
 249   def urls(m, params)
 250     channel = params[:channel] ? params[:channel] : m.target
 251     max = params[:limit].to_i
 252     max = 10 if max > 10
 253     max = 1 if max < 1
 254     list = @registry[channel]
 255     if list.empty?
 256       m.reply "no urls seen yet for channel #{channel}"
 257     else
 258       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 259     end
 260   end
 261
 262   def search(m, params)
 263     channel = params[:channel] ? params[:channel] : m.target
 264     max = params[:limit].to_i
 265     string = params[:string]
 266     max = 10 if max > 10
 267     max = 1 if max < 1
 268     regex = Regexp.new(string, Regexp::IGNORECASE)
 269     list = @registry[channel].find_all {|url|
 270       regex.match(url.url) || regex.match(url.nick) ||
 271         (@bot.config['url.info_on_list'] && regex.match(url.info))
 272     }
 273     if list.empty?
 274       m.reply "no matches for channel #{channel}"
 275     else
 276       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 277     end
 278   end
 279 end
 280
 281 plugin = UrlPlugin.new
 282 plugin.map 'urls info *urls', :action => 'info'
 283 plugin.map 'url info *urls', :action => 'info'
 284 plugin.map 'urls search :channel :limit :string', :action => 'search',
 285                           :defaults => {:limit => 4},
 286                           :requirements => {:limit => /^\d+$/},
 287                           :public => false
 288 plugin.map 'urls search :limit :string', :action => 'search',
 289                           :defaults => {:limit => 4},
 290                           :requirements => {:limit => /^\d+$/},
 291                           :private => false
 292 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 293                           :requirements => {:limit => /^\d+$/},
 294                           :public => false
 295 plugin.map 'urls :limit', :defaults => {:limit => 4},
 296                           :requirements => {:limit => /^\d+$/},
 297                           :private => false