data/rbot/plugins/url.rb

   1 #-- vim:sw=2:et
   2 #++
   3 #
   4 # :title: Url plugin
   5
   6 define_structure :Url, :channel, :nick, :time, :url, :info
   7
   8 class UrlPlugin < Plugin
   9   LINK_INFO = "[Link Info]"
  10   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  11
  # --- Plugin configuration keys -------------------------------------------

  # Upper bound on the per-channel url history.
  Config.register(Config::IntegerValue.new('url.max_urls',
    :default => 100,
    :validate => Proc.new { |v| v > 0 },
    :desc => "Maximum number of urls to store. New urls replace oldest ones."))

  # How many links per line get an automatic info reply (0 turns it off).
  Config.register(Config::IntegerValue.new('url.display_link_info',
    :default => 0,
    :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)"))

  # Suppress replies for resources that have no title.
  Config.register(Config::BooleanValue.new('url.titles_only',
    :default => false,
    :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)"))

  # Include page text in the reply.
  Config.register(Config::BooleanValue.new('url.first_par',
    :default => false,
    :desc => "Also try to get the first paragraph of a web page"))

  # Show link info in the output of the list/search commands.
  Config.register(Config::BooleanValue.new('url.info_on_list',
    :default => false,
    :desc => "Show link info when listing/searching for urls"))

  # Hosts that are never queried; changing the value rebuilds the plugin's
  # compiled regexp through reset_no_info_hosts.
  Config.register(Config::ArrayValue.new('url.no_info_hosts',
    :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
    :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
    :desc => "A list of regular expressions matching hosts for which no info should be provided"))

  # Channel whitelist for automatic link info (empty means no restriction).
  Config.register(Config::ArrayValue.new('url.only_on_channels',
    :default => [],
    :desc => "Show link info only on these channels"))

  # Hostmasks whose urls are skipped.
  Config.register(Config::ArrayValue.new('url.ignore',
    :default => [],
    :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example."))
  37
  # Set up the plugin: registry entries default to an empty Array, the
  # display_link_info setting is re-set from its string form when the stored
  # value is not an Integer, the no-info host regexp is compiled and the
  # :htmlinfo filter group is loaded.
  def initialize
    super
    @registry.set_default([])

    link_info = @bot.config['url.display_link_info']
    # presumably converts values saved by an older, non-integer version of
    # this option -- TODO confirm against Config::IntegerValue#set_string
    unless link_info.kind_of?(Integer)
      @bot.config.items[:'url.display_link_info'].set_string(link_info.to_s)
    end

    reset_no_info_hosts
    self.filter_group = :htmlinfo
    load_filters
  end
  48
  # Rebuild @no_info_hosts, the case-insensitive regexp matching every host
  # for which link info must not be retrieved, from the 'url.no_info_hosts'
  # configuration value.
  #
  # Empty entries are dropped, and an empty list yields a regexp that never
  # matches: joining nothing (or an empty alternative) with '|' would produce
  # a pattern that matches every host and silently disable link info for all
  # of them.
  def reset_no_info_hosts
    patterns = @bot.config['url.no_info_hosts'].reject { |pat| pat.to_s.empty? }
    @no_info_hosts =
      if patterns.empty?
        /(?!)/ # empty negative lookahead: can never match
      else
        Regexp.new(patterns.join('|'), true)
      end
    debug "no info hosts regexp set to #{@no_info_hosts}"
  end
  53
  # Usage text for the plugin; the same text is returned whatever +plugin+
  # and +topic+ are.
  def help(plugin, topic="")
    "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), " \
    "urls [<max>=4] => list <max> last urls mentioned in current channel, " \
    "urls search [<max>=4] <regexp> => search for matching urls. " \
    "In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  end
  57
  # Title of an HTML page, as produced by the +ircify_html_title+ method of
  # +pagedata+.
  def get_title_from_html(pagedata)
    pagedata.ircify_html_title
  end
  61
  # Retrieve a one-line description (title and/or content type, encoding,
  # size, first paragraph) for an http(s) URL.
  #
  # uri_str:: the URL, either a String or an already parsed URI
  # opts::    options hash; :always_reply makes the method return an
  #           explanatory message instead of +false+ when the host is listed
  #           in url.no_info_hosts. A copy of the hash, extended with :title
  #           and :extra, is handed to the :url_added event.
  #
  # Returns the description String, +nil+ when the URL is not http(s) or when
  # url.titles_only is set and no title was found, +false+ (or a "Sorry"
  # message with :always_reply) for blocked hosts, and an "Unable to
  # retrieve" message when the host cannot be resolved.
  #
  # Raises UrlLinkError unchanged, and a RuntimeError describing any other
  # failure met while fetching or processing the page.
  def get_title_for_url(uri_str, opts = {})

    url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
    # anchored, so that only the http and https schemes get through
    return if url.scheme !~ /\Ahttps?\z/i

    # also check the ip, the canonical name and the aliases
    begin
      checks = TCPSocket.gethostbyname(url.host)
      # the result is [canonical name, aliases, address family, *addresses]:
      # drop the family (always at index 2) and keep every address, so that
      # hosts resolving to several addresses have all of them checked
      checks.delete_at(2)
    rescue => e
      return "Unable to retrieve info for #{url.host}: #{e.message}"
    end

    checks << url.host
    checks.flatten!

    unless checks.grep(@no_info_hosts).empty?
      return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
    end

    logopts = opts.dup

    title = nil
    extra = []

    begin
      debug "+ getting info for #{url.request_uri}"
      info = @bot.filter(:htmlinfo, url)
      debug info
      resp = info[:headers]

      logopts[:title] = title = info[:title]

      if info[:content]
        logopts[:extra] = info[:content]
        extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
      else
        # no textual content: describe the resource through its headers
        logopts[:extra] = String.new
        logopts[:extra] << "Content Type: #{resp['content-type']}"
        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
        if enc = resp['content-encoding']
          logopts[:extra] << ", encoding: #{enc}"
          extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
        end

        # content length with thousands separators; nil when the header is
        # missing or not in the expected form
        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
        if size
          logopts[:extra] << ", size: #{size} bytes"
          extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
        end
      end
    rescue Exception => e
      # NOTE(review): rescuing Exception is kept on purpose here, narrowing
      # it could let errors that are not StandardErrors escape the plugin
      case e
      when UrlLinkError
        raise e
      else
        error e
        raise "connecting to site/processing information (#{e.message})"
      end
    end

    call_event(:url_added, url.to_s, logopts)
    if title
      extra.unshift("#{Bold}title#{Bold}: #{title}")
    end
    return extra.join(", ") if title or not @bot.config['url.titles_only']
  end
 129
 130   def handle_urls(m, params={})
 131     opts = {
 132       :display_info => @bot.config['url.display_link_info'],
 133       :channels => @bot.config['url.only_on_channels'],
 134       :ignore => @bot.config['url.ignore']
 135     }.merge params
 136     urls = opts[:urls]
 137     display_info= opts[:display_info]
 138     channels = opts[:channels]
 139     ignore = opts[:ignore]
 140
 141     unless channels.empty?
 142       return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
 143     end
 144
 145     ignore.each { |u| return if m.source.matches?(u) }
 146
 147     return if urls.empty?
 148     debug "found urls #{urls.inspect}"
 149     list = m.public? ? @registry[m.target] : nil
 150     debug "display link info: #{display_info}"
 151     urls_displayed = 0
 152     urls.each do |urlstr|
 153       debug "working on #{urlstr}"
 154       next unless urlstr =~ /^https?:\/\/./
 155       title = nil
 156       debug "Getting title for #{urlstr}..."
 157       reply = nil
 158       begin
 159         title = get_title_for_url(urlstr,
 160                                   :always_reply => m.address?,
 161                                   :nick => m.source.nick,
 162                                   :channel => m.channel,
 163                                   :ircline => m.message)
 164         debug "Title #{title ? '' : 'not '} found"
 165         reply = "#{LINK_INFO} #{title}" if title
 166       rescue => e
 167         debug e
 168         # we might get a 404 because of trailing punctuation, so we try again
 169         # with the last character stripped. this might generate invalid URIs
 170         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
 171         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
 172           # chop off last non-word character from the unescaped version of
 173           # the URL, and retry if we still have enough string to look like a
 174           # minimal URL
 175           unescaped = URI.unescape(urlstr)
 176           debug "Unescaped: #{unescaped}"
 177           if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
 178             urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
 179             retry
 180           else
 181             debug "Not retrying #{unescaped}"
 182           end
 183         end
 184         reply = "Error #{e.message}"
 185       end
 186
 187       if display_info > urls_displayed
 188         if reply
 189           m.reply reply, :overlong => :truncate, :to => :public,
 190             :nick => (m.address? ? :auto : false)
 191           urls_displayed += 1
 192         end
 193       end
 194
 195       next unless list
 196
 197       # check to see if this url is already listed
 198       next if list.find {|u| u.url == urlstr }
 199
 200       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
 201       debug "#{list.length} urls so far"
 202       list.pop if list.length > @bot.config['url.max_urls']
 203       debug "storing url #{url.url}"
 204       list.unshift url
 205       debug "#{list.length} urls now"
 206     end
 207     @registry[m.target] = list
 208   end
 209
 210   def info(m, params)
 211     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
 212     urls = URI.extract(escaped)
 213     Thread.new do
 214       handle_urls(m,
 215                   :urls => urls,
 216                   :display_info => params[:urls].length,
 217                   :channels => [])
 218     end
 219   end
 220
 221   def message(m)
 222     return if m.address?
 223
 224     escaped = URI.escape(m.message, OUR_UNSAFE)
 225     urls = URI.extract(escaped, ['http', 'https'])
 226     return if urls.empty?
 227     Thread.new { handle_urls(m, :urls => urls) }
 228   end
 229
 230   def reply_urls(opts={})
 231     list = opts[:list]
 232     max = opts[:max]
 233     channel = opts[:channel]
 234     m = opts[:msg]
 235     return unless list and max and m
 236     list[0..(max-1)].each do |url|
 237       disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
 238       if @bot.config['url.info_on_list']
 239         title = url.info ||
 240           get_title_for_url(url.url,
 241                             :nick => url.nick, :channel => channel) rescue nil
 242         # If the url info was missing and we now have some, try to upgrade it
 243         if channel and title and not url.info
 244           ll = @registry[channel]
 245           debug ll
 246           if el = ll.find { |u| u.url == url.url }
 247             el.info = title
 248             @registry[channel] = ll
 249           end
 250         end
 251         disp << " --> #{title}" if title
 252       end
 253       m.reply disp, :overlong => :truncate
 254     end
 255   end
 256
 257   def urls(m, params)
 258     channel = params[:channel] ? params[:channel] : m.target
 259     max = params[:limit].to_i
 260     max = 10 if max > 10
 261     max = 1 if max < 1
 262     list = @registry[channel]
 263     if list.empty?
 264       m.reply "no urls seen yet for channel #{channel}"
 265     else
 266       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 267     end
 268   end
 269
 270   def search(m, params)
 271     channel = params[:channel] ? params[:channel] : m.target
 272     max = params[:limit].to_i
 273     string = params[:string]
 274     max = 10 if max > 10
 275     max = 1 if max < 1
 276     regex = Regexp.new(string, Regexp::IGNORECASE)
 277     list = @registry[channel].find_all {|url|
 278       regex.match(url.url) || regex.match(url.nick) ||
 279         (@bot.config['url.info_on_list'] && regex.match(url.info))
 280     }
 281     if list.empty?
 282       m.reply "no matches for channel #{channel}"
 283     else
 284       reply_urls :msg => m, :channel => channel, :list => list, :max => max
 285     end
 286   end
 287 end
 288
 289 plugin = UrlPlugin.new
 290 plugin.map 'urls info *urls', :action => 'info'
 291 plugin.map 'url info *urls', :action => 'info'
 292 plugin.map 'urls search :channel :limit :string', :action => 'search',
 293                           :defaults => {:limit => 4},
 294                           :requirements => {:limit => /^\d+$/},
 295                           :public => false
 296 plugin.map 'urls search :limit :string', :action => 'search',
 297                           :defaults => {:limit => 4},
 298                           :requirements => {:limit => /^\d+$/},
 299                           :private => false
 300 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
 301                           :requirements => {:limit => /^\d+$/},
 302                           :public => false
 303 plugin.map 'urls :limit', :defaults => {:limit => 4},
 304                           :requirements => {:limit => /^\d+$/},
 305                           :private => false