data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class UrlPlugin < Plugin
   9   LINK_INFO = "[Link Info]"
  10   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  11
  12   Config.register Config::IntegerValue.new('url.max_urls',
  13     :default => 100, :validate => Proc.new{|v| v > 0},
  14     :desc => "Maximum number of urls to store. New urls replace oldest ones.")
  15   Config.register Config::IntegerValue.new('url.display_link_info',
  16     :default => 0,
  17     :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
  18   Config.register Config::BooleanValue.new('url.titles_only',
  19     :default => false,
  20     :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
  21   Config.register Config::BooleanValue.new('url.first_par',
  22     :default => false,
  23     :desc => "Also try to get the first paragraph of a web page")
  24   Config.register Config::BooleanValue.new('url.info_on_list',
  25     :default => false,
  26     :desc => "Show link info when listing/searching for urls")
  27   Config.register Config::ArrayValue.new('url.no_info_hosts',
  28     :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
  29     :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
  30     :desc => "A list of regular expressions matching hosts for which no info should be provided")
  31   Config.register Config::ArrayValue.new('url.only_on_channels',
  32     :desc => "Show link info only on these channels",
  33     :default => [])
  34   Config.register Config::ArrayValue.new('url.ignore',
  35     :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
  36     :default => [])
  37
  # Sets up the plugin: per-channel url lists default to an empty array, the
  # 'url.display_link_info' setting is re-parsed from its string form when it
  # is not already an Integer, the no-info host regexp is compiled, and the
  # :htmlinfo filters are loaded.
  def initialize
    super
    @registry.set_default([])

    link_info = @bot.config['url.display_link_info']
    unless Integer === link_info
      # Feed the stored value back through the config item as a string.
      @bot.config.items[:'url.display_link_info'].set_string(link_info.to_s)
    end

    reset_no_info_hosts
    self.filter_group = :htmlinfo
    load_filters
  end
  48
  # Rebuilds @no_info_hosts, the case-insensitive regexp matching hosts for
  # which no link info must be fetched, from the 'url.no_info_hosts' setting.
  #
  # Blank patterns are dropped, and when no pattern is left a regexp that can
  # never match is used: joining an empty list (or a list containing an empty
  # string) would otherwise produce a pattern that matches every host and
  # silently disable link info everywhere.
  def reset_no_info_hosts
    patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
    @no_info_hosts =
      if patterns.empty?
        /(?!)/ # empty negative lookahead: never matches
      else
        Regexp.new(patterns.join('|'), true)
      end
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  53
  # Returns the usage summary for the plugin's commands. The same text is
  # returned whatever +plugin+ and +topic+ are.
  def help(plugin, topic="")
    usage = "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
    usage
  end
  57
  # Returns whatever +pagedata+ answers to #ircify_html_title.
  def get_title_from_html(pagedata)
    pagedata.ircify_html_title
  end
  61
  # Fetches displayable link info (title, and optionally first paragraph,
  # content type, encoding and size) for a URL.
  #
  # uri_str:: a URI object or a string parseable by URI.parse
  # opts::    options; :always_reply makes the method return an explanatory
  #           message instead of +false+ when the host is on the no-info
  #           list. A copy of the options, extended with :htmlinfo, :title
  #           and :extra, is passed along with the :url_added event.
  #
  # Returns
  # * +nil+ when the URL scheme is not http or https,
  # * an error String when the host name cannot be resolved,
  # * +false+ (or a "Sorry, ..." String with :always_reply) when the host, its
  #   resolved name, aliases or addresses match @no_info_hosts,
  # * otherwise the comma-joined info String, or +nil+ when there is no title
  #   and 'url.titles_only' is set.
  #
  # Raises UrlLinkError unchanged, and a RuntimeError wrapping any other
  # failure that happens while retrieving or processing the page.
  def get_title_for_url(uri_str, opts = {})

    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # Anchored on purpose: an unanchored /https?/ would also accept schemes
    # that merely contain "http" (e.g. "shttp" or "xhttps").
    return unless url.scheme =~ /\Ahttps?\z/i

    # also check the ip, the canonical name and the aliases
    begin
      checks = TCPSocket.gethostbyname(url.host)
      # drop the second-to-last element of the lookup result (the address
      # family), which is not a host name or address
      checks.delete_at(-2)
    rescue => e
      return "Unable to retrieve info for #{url.host}: #{e.message}"
    end

    checks << url.host
    checks.flatten!

    unless checks.grep(@no_info_hosts).empty?
      return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
    end

    logopts = opts.dup

    title = nil
    extra = []

    begin
      debug "+ getting info for #{url.request_uri}"
      info = @bot.filter(:htmlinfo, url)
      debug info
      logopts[:htmlinfo] = info
      resp = info[:headers]

      logopts[:title] = title = info[:title]

      if info[:content]
        logopts[:extra] = info[:content]
        extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
      else
        # No textual content: describe the resource from its headers instead.
        logopts[:extra] = String.new
        logopts[:extra] << "Content Type: #{resp['content-type']}"
        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
        if enc = resp['content-encoding']
          logopts[:extra] << ", encoding: #{enc}"
          extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
        end

        # Insert thousands separators; any failure (e.g. a missing header)
        # simply means no size is reported.
        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
        if size
          logopts[:extra] << ", size: #{size} bytes"
          extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
        end
      end
    rescue Exception => e
      # NOTE(review): Exception (not StandardError) is rescued here, so every
      # failure is logged and re-raised as a plain error; kept as is because
      # narrowing it could change which failures get reported -- confirm.
      case e
      when UrlLinkError
        raise e
      else
        error e
        raise "connecting to site/processing information (#{e.message})"
      end
    end

    call_event(:url_added, url.to_s, logopts)
    if title
      extra.unshift("#{Bold}title#{Bold}: #{title}")
    end
    return extra.join(", ") if title or not @bot.config['url.titles_only']
  end
 130
 131   def handle_urls(m, params={})
 132     opts = {
 133       :display_info => @bot.config['url.display_link_info'],
 134       :channels => @bot.config['url.only_on_channels'],
 135       :ignore => @bot.config['url.ignore']
 136     }.merge params
 137     urls = opts[:urls]
 138     display_info= opts[:display_info]
 139     channels = opts[:channels]
 140     ignore = opts[:ignore]
 141
 142     unless channels.empty?
 143       return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
 144     end
 145
 146     ignore.each { |u| return if m.source.matches?(u) }
 147
 148     return if urls.empty?
 149     debug "found urls #{urls.inspect}"
 150     list = m.public? ? @registry[m.target] : nil
 151     debug "display link info: #{display_info}"
 152     urls_displayed = 0
 153     urls.each do |urlstr|
 154       debug "working on #{urlstr}"
 155       next unless urlstr =~ /^https?:\/\/./
 156       title = nil
 157       debug "Getting title for #{urlstr}..."
 158       reply = nil
 159       begin
 160         title = get_title_for_url(urlstr,
 161                                   :always_reply => m.address?,
 162                                   :nick => m.source.nick,
 163                                   :channel => m.channel,
 164                                   :ircline => m.message)
 165         debug "Title #{title ? '' : 'not '} found"
 166         reply = "#{LINK_INFO} #{title}" if title
 167       rescue => e
 168         debug e
 169         # we might get a 404 because of trailing punctuation, so we try again
 170         # with the last character stripped. this might generate invalid URIs
 171         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 172         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 173           # chop off last non-word character from the unescaped version of
 174           # the URL, and retry if we still have enough string to look like a
 175           # minimal URL
 176           unescaped = URI.unescape(urlstr)
 177           debug "Unescaped: #{unescaped}"
 178           if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
 179             urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
 180             retry
 181           else
 182             debug "Not retrying #{unescaped}"
 183           end
 184         end
 185         reply = "Error #{e.message}"
 186       end
 187
 188       if display_info > urls_displayed
 189         if reply
 190           m.reply reply, :overlong => :truncate, :to => :public,
 191             :nick => (m.address? ? :auto : false)
 192           urls_displayed += 1
 193         end
 194       end
 195
 196       next unless list
 197
 198       # check to see if this url is already listed
 199       next if list.find {|u| u.url == urlstr }
 200
 201       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 202       debug "#{list.length} urls so far"
 203       list.pop if list.length > @bot.config['url.max_urls']
 204       debug "storing url #{url.url}"
 205       list.unshift url
 206       debug "#{list.length} urls now"
 207     end
 208     @registry[m.target] = list
 209   end
 210
 211   def info(m, params)
 212     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 213     urls = URI.extract(escaped)
 214     Thread.new do
 215       handle_urls(m,
 216                   :urls => urls,
 217                   :display_info => params[:urls].length,
 218                   :channels => [])
 219     end
 220   end
 221
 222   def message(m)
 223     return if m.address?
 224
 225     escaped = URI.escape(m.message, OUR_UNSAFE)
 226     urls = URI.extract(escaped, ['http', 'https'])
 227     return if urls.empty?
 228     Thread.new { handle_urls(m, :urls => urls) }
 229   end
 230
 231   def reply_urls(opts={})
 232     list = opts[:list]
 233     max = opts[:max]
 234     channel = opts[:channel]
 235     m = opts[:msg]
 236     return unless list and max and m
 237     list[0..(max-1)].each do |url|
 238       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 239       if @bot.config['url.info_on_list']
 240         title = url.info ||
 241           get_title_for_url(url.url,
 242                             :nick => url.nick, :channel => channel) rescue nil
 243         # If the url info was missing and we now have some, try to upgrade it
 244         if channel and title and not url.info
 245           ll = @registry[channel]
 246           debug ll
 247           if el = ll.find { |u| u.url == url.url }
 248             el.info = title
 249             @registry[channel] = ll
 250           end
 251         end
 252         disp << " --> #{title}" if title
 253       end
 254       m.reply disp, :overlong => :truncate
 255     end
 256   end
 257
 258   def urls(m, params)
 259     channel = params[:channel] ? params[:channel] : m.target
 260     max = params[:limit].to_i
 261     max = 10 if max > 10
 262     max = 1 if max < 1
 263     list = @registry[channel]
 264     if list.empty?
 265       m.reply "no urls seen yet for channel #{channel}"
 266     else
 267       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 268     end
 269   end
 270
 271   def search(m, params)
 272     channel = params[:channel] ? params[:channel] : m.target
 273     max = params[:limit].to_i
 274     string = params[:string]
 275     max = 10 if max > 10
 276     max = 1 if max < 1
 277     regex = Regexp.new(string, Regexp::IGNORECASE)
 278     list = @registry[channel].find_all {|url|
 279       regex.match(url.url) || regex.match(url.nick) ||
 280         (@bot.config['url.info_on_list'] && regex.match(url.info))
 281     }
 282     if list.empty?
 283       m.reply "no matches for channel #{channel}"
 284     else
 285       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 286     end
 287   end
 288 end
 289
 290 plugin = UrlPlugin.new
 291 plugin.map 'urls info *urls', :action => 'info'
 292 plugin.map 'url info *urls', :action => 'info'
 293 plugin.map 'urls search :channel :limit :string', :action => 'search',
 294                           :defaults => {:limit => 4},
 295                           :requirements => {:limit => /^\d+$/},
 296                           :public => false
 297 plugin.map 'urls search :limit :string', :action => 'search',
 298                           :defaults => {:limit => 4},
 299                           :requirements => {:limit => /^\d+$/},
 300                           :private => false
 301 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 302                           :requirements => {:limit => /^\d+$/},
 303                           :public => false
 304 plugin.map 'urls :limit', :defaults => {:limit => 4},
 305                           :requirements => {:limit => /^\d+$/},
 306                           :private => false