# Record type for one remembered URL: the target (channel) it was seen on, the
# nick that posted it, the time it was seen, the URL string itself and any
# link info retrieved for it.
# NOTE(review): define_structure is not defined in this file — presumably a
# Struct-style helper supplied by the host framework; confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin that remembers URLs mentioned in messages and can display
# information about them.
8 class UrlPlugin < Plugin
# Prefix placed in front of every link-info reply built in handle_urls.
9 LINK_INFO = "[Link Info]"
# Character class handed to URI.escape as the "unsafe" set: it matches any
# character that is NOT URI-unreserved, URI-reserved, '%', '#' or a space,
# so those characters are left untouched when a message is escaped.
# NOTE(review): the third argument ('N') is the legacy encoding/kcode flag of
# Regexp.new; recent Ruby versions no longer accept it — confirm the
# targeted Ruby version.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration keys owned by this plugin. Each key's :desc string is the
# authoritative description of what it controls.
# url.max_urls must be strictly positive (enforced by its :validate proc).
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
18 Config.register Config::BooleanValue.new('url.auto_shorten',
20 :desc => "Automatically spit out shortened URLs when they're seen. Check shortenurls for config options")
21 Config.register Config::IntegerValue.new('url.auto_shorten_min_length',
23 :desc => "Minimum length of URL to auto-shorten. Only has an effect when url.auto_shorten is true.")
24 Config.register Config::BooleanValue.new('url.titles_only',
26 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
27 Config.register Config::BooleanValue.new('url.first_par',
29 :desc => "Also try to get the first paragraph of a web page")
30 Config.register Config::BooleanValue.new('url.info_on_list',
32 :desc => "Show link info when listing/searching for urls")
# The default patterns cover localhost plus the 192.168.*, 10.*, 127.* and
# 172.16-31.* address prefixes. Changing the value re-runs
# reset_no_info_hosts on the registered 'url' plugin so the cached regexp
# follows the configuration.
33 Config.register Config::ArrayValue.new('url.no_info_hosts',
34 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
35 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
36 :desc => "A list of regular expressions matching hosts for which no info should be provided")
37 Config.register Config::ArrayValue.new('url.only_on_channels',
38 :desc => "Show link info only on these channels",
40 Config.register Config::ArrayValue.new('url.ignore',
41 :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
# Set-up statements for the plugin instance.
# NOTE(review): set_default presumably makes @registry return an empty Array
# for keys that were never stored — confirm against the registry API.
46 @registry.set_default(Array.new)
# If the stored url.display_link_info value is not an Integer, feed its
# string form back through the config item's set_string.
# NOTE(review): presumably this converts a value saved under an older,
# non-integer definition of the setting — confirm.
47 unless @bot.config['url.display_link_info'].kind_of?(Integer)
48 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Selects the :htmlinfo filter group, the same group that get_title_for_url
# passes to @bot.filter.
51 self.filter_group = :htmlinfo
# Rebuilds @no_info_hosts, the case-insensitive regexp used to decide which
# hosts must not have link info retrieved, from the 'url.no_info_hosts'
# config list. Also invoked by that setting's :on_change hook.
#
# Blank entries are ignored. When no usable pattern remains the regexp is set
# to one that can never match, so that an empty list means "no host is
# blocked". Joining an empty list would otherwise yield "", which compiles to
# a regexp matching every host and silently disables link info everywhere.
#
# Raises RegexpError if a configured pattern is not a valid regular
# expression.
def reset_no_info_hosts
  patterns = Array(@bot.config['url.no_info_hosts']).reject { |pat| pat.to_s.strip.empty? }
  @no_info_hosts =
    if patterns.empty?
      # Negative lookahead on the empty string: fails at every position.
      /(?!)/
    else
      Regexp.new(patterns.join('|'), true)
    end
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Returns the usage text for the plugin's commands. The visible body returns
# the same string whatever plugin and topic are.
60 def help(plugin, topic="")
61 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns the title for a chunk of page data by delegating to
# pagedata.ircify_html_title.
# NOTE(review): ircify_html_title is defined outside this file — presumably
# an extension on String; confirm what it returns when no title is present.
64 def get_title_from_html(pagedata)
65 return pagedata.ircify_html_title
# Builds the link-info text for a URL.
#
# uri_str:: a URI object, or a String that is parsed with URI.parse.
# opts::    option hash; :always_reply is read here to decide whether a
#           refusal message is returned for blocked hosts.
#
# Returns nil when the scheme check fails, an "Unable to retrieve info"
# string when the host lookup fails, false (or a "Sorry, ..." string when
# opts[:always_reply] is set) when the host matches @no_info_hosts, and
# otherwise the collected info fields joined with ", " — but only if a title
# was found or url.titles_only is off.
# Raises RuntimeError ("connecting to site/processing information (...)")
# when fetching or processing fails. Emits the :url_added event with the URL
# string and the collected log options.
68 def get_title_for_url(uri_str, opts = {})
70 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme merely containing
# "http" passes this check — confirm that is acceptable.
71 return if url.scheme !~ /https?/
73 # also check the ip, the canonical name and the aliases
75 checks = TCPSocket.gethostbyname(url.host)
78 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Any looked-up name/address matching the no-info regexp blocks retrieval.
84 unless checks.grep(@no_info_hosts).empty?
85 return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
94 debug "+ getting info for #{url.request_uri}"
# The :htmlinfo filter yields a hash read below via :title and :content.
95 info = @bot.filter(:htmlinfo, url)
97 logopts[:htmlinfo] = info
100 logopts[:title] = title = info[:title]
# Page content is only shown to the user when url.first_par is enabled, but
# it is always recorded in logopts.
103 logopts[:extra] = info[:content]
104 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# Fallback description built from response headers: content type, encoding
# and size.
# NOTE(review): resp is assigned on a line not shown here — confirm its type.
106 logopts[:extra] = String.new
107 logopts[:extra] << "Content Type: #{resp['content-type']}"
108 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
109 if enc = resp['content-encoding']
110 logopts[:extra] << ", encoding: #{enc}"
111 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Inserts ',' thousands separators into the content-length digits; any error
# while reading the header leaves size as nil.
114 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
116 logopts[:extra] << ", size: #{size} bytes"
117 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also traps SystemExit and Interrupt —
# confirm this breadth is intended rather than StandardError.
120 rescue Exception => e
126 raise "connecting to site/processing information (#{e.message})"
130 call_event(:url_added, url.to_s, logopts)
# The title, when present, goes first in the reply.
132 extra.unshift("#{Bold}title#{Bold}: #{title}")
134 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the URLs attached to message m. For each http(s) URL it may reply
# with a shortened form, reply with link info, and record the URL in the
# registry list kept for the message target.
#
# m::      the message being handled.
# params:: options; the defaults for :display_info, :channels and :ignore
#          come from the url.display_link_info, url.only_on_channels and
#          url.ignore settings.
# NOTE(review): how params is merged into opts and where urls is assigned are
# on lines not shown here — presumably urls comes from params[:urls]; confirm.
137 def handle_urls(m, params={})
139 :display_info => @bot.config['url.display_link_info'],
140 :channels => @bot.config['url.only_on_channels'],
141 :ignore => @bot.config['url.ignore']
144 display_info= opts[:display_info]
145 channels = opts[:channels]
146 ignore = opts[:ignore]
# When a channel whitelist is configured, only act on those channels
# (compared case-insensitively).
148 unless channels.empty?
149 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
# Do nothing for senders matching any ignored hostmask.
152 ignore.each { |u| return if m.source.matches?(u) }
154 return if urls.empty?
155 debug "found urls #{urls.inspect}"
# URLs are only remembered for public messages; list is nil otherwise.
156 list = m.public? ? @registry[m.target] : nil
157 debug "display link info: #{display_info}"
159 urls.each do |urlstr|
160 debug "working on #{urlstr}"
# Skip anything that is not an http:// or https:// URL with at least one
# character after the scheme.
161 next unless urlstr =~ /^https?:\/\/./
# Optional auto-shortening, delegated to the 'shortenurls' plugin.
162 if @bot.config['url.auto_shorten'] == true and
163 urlstr.length >= @bot.config['url.auto_shorten_min_length']
164 m.reply(bot.plugins['shortenurls'].shorten(nil, {:url=>urlstr, :called=>true}))
168 debug "Getting title for #{urlstr}..."
171 title = get_title_for_url(urlstr,
172 :always_reply => m.address?,
173 :nick => m.source.nick,
174 :channel => m.channel,
175 :ircline => m.message)
176 debug "Title #{title ? '' : 'not '} found"
177 reply = "#{LINK_INFO} #{title}" if title
180 # we might get a 404 because of trailing punctuation, so we try again
181 # with the last character stripped. this might generate invalid URIs
182 # (e.g. because "some.url" gets chopped to some.url%2), so catch that too
183 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
184 # chop off last non-word character from the unescaped version of
185 # the URL, and retry if we still have enough string to look like a
# NOTE(review): URI.unescape and URI.escape were removed in Ruby 3.0 —
# confirm the targeted Ruby version.
187 unescaped = URI.unescape(urlstr)
188 debug "Unescaped: #{unescaped}"
# sub! returns nil when there was no trailing non-word character to strip,
# in which case no retry happens. urlstr is replaced in place, so later
# uses of urlstr see the chopped URL.
189 if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
190 urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
193 debug "Not retrying #{unescaped}"
196 reply = "Error #{e.message}"
# Reply only while fewer than display_info URLs have been shown.
# NOTE(review): urls_displayed is initialised and incremented on lines not
# shown here — confirm.
199 if display_info > urls_displayed
201 m.reply reply, :overlong => :truncate, :to => :public,
202 :nick => (m.address? ? :auto : false)
209 # check to see if this url is already listed
210 next if list.find {|u| u.url == urlstr }
212 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
213 debug "#{list.length} urls so far"
# NOTE(review): the trim uses '>' and runs before the new entry is inserted
# (insertion is on a line not shown), so the list may hold one entry more
# than url.max_urls — confirm.
214 list.pop if list.length > @bot.config['url.max_urls']
215 debug "storing url #{url.url}"
217 debug "#{list.length} urls now"
# Write the list back to the registry under the message target.
219 @registry[m.target] = list
# Escape the stringified :urls parameter with the plugin's unsafe-character
# set, then extract every URI from the result.
223 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
224 urls = URI.extract(escaped)
# Raise the display limit to the size of the :urls parameter.
# NOTE(review): presumably :urls is the word array captured by the '*urls'
# route, so this lets every requested URL get a reply — confirm.
228 :display_info => params[:urls].length,
# Escape the message text, pull out the http/https URLs it contains, and —
# if there are any — hand them to handle_urls on a new thread.
236 escaped = URI.escape(m.message, OUR_UNSAFE)
237 urls = URI.extract(escaped, ['http', 'https'])
238 return if urls.empty?
239 Thread.new { handle_urls(m, :urls => urls) }
# Replies to a message with entries from a list of stored Url records, one
# reply per entry, each formatted as "[time] <nick> url" and optionally
# followed by " --> title".
#
# opts:: option hash; callers in this file pass :msg, :channel, :list and
#        :max. Nothing is sent unless list, max and m are all set.
242 def reply_urls(opts={})
245 channel = opts[:channel]
247 return unless list and max and m
# NOTE(review): when max is 0 the range becomes 0..-1, which selects the
# whole list rather than nothing — confirm that is intended.
248 list[0..(max-1)].each do |url|
249 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
250 if @bot.config['url.info_on_list']
# Any error raised while fetching the title is swallowed and yields nil.
252 get_title_for_url(url.url,
253 :nick => url.nick, :channel => channel) rescue nil
254 # If the url info was missing and we now have some, try to upgrade it
255 if channel and title and not url.info
256 ll = @registry[channel]
258 if el = ll.find { |u| u.url == url.url }
# Store the list back so the change to the matching entry is persisted.
260 @registry[channel] = ll
263 disp << " --> #{title}" if title
265 m.reply disp, :overlong => :truncate
# Lists stored URLs for a channel: the explicit :channel parameter wins,
# otherwise the message target is used; :limit is converted to an Integer.
270 channel = params[:channel] ? params[:channel] : m.target
271 max = params[:limit].to_i
274 list = @registry[channel]
276 m.reply "no urls seen yet for channel #{channel}"
278 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the stored URLs of a channel for entries matching a regexp and
# replies with the matches via reply_urls, or with a "no matches" message.
#
# m::      the requesting message; its target is the default channel.
# params:: :channel (optional), :limit (converted with to_i) and :string,
#          the pattern source.
282 def search(m, params)
283 channel = params[:channel] ? params[:channel] : m.target
284 max = params[:limit].to_i
285 string = params[:string]
# The pattern is compiled case-insensitively.
# NOTE(review): string comes from the user and Regexp.new raises RegexpError
# on an invalid pattern; no handling is visible here — confirm.
288 regex = Regexp.new(string, Regexp::IGNORECASE)
# An entry matches on its URL or nick, and also on its stored info when
# url.info_on_list is enabled.
289 list = @registry[channel].find_all {|url|
290 regex.match(url.url) || regex.match(url.nick) ||
291 (@bot.config['url.info_on_list'] && regex.match(url.info))
294 m.reply "no matches for channel #{channel}"
296 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes. Both 'urls info'
# and 'url info' dispatch to the 'info' action with the remaining words
# captured as :urls.
301 plugin = UrlPlugin.new
302 plugin.map 'urls info *urls', :action => 'info'
303 plugin.map 'url info *urls', :action => 'info'
# For the search and listing routes, :limit must be all digits and defaults
# to 4. Each form taking an explicit :channel is registered before the
# corresponding form without one.
304 plugin.map 'urls search :channel :limit :string', :action => 'search',
305 :defaults => {:limit => 4},
306 :requirements => {:limit => /^\d+$/},
308 plugin.map 'urls search :limit :string', :action => 'search',
309 :defaults => {:limit => 4},
310 :requirements => {:limit => /^\d+$/},
312 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
313 :requirements => {:limit => /^\d+$/},
315 plugin.map 'urls :limit', :defaults => {:limit => 4},
316 :requirements => {:limit => /^\d+$/},