# Record describing one URL seen by the bot. Instances are built in this file as
# Url.new(channel, nick, time, url, info) and read back through the matching
# accessors (url.url, url.nick, url.time, url.info).
# NOTE(review): define_structure is not defined in this file — presumably a
# Struct-like helper provided by the bot framework; confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin that keeps a per-target list of URLs seen in messages and replies with
# information (title, content type, size, ...) about them.
8 class UrlPlugin < Plugin
# Prefix placed in front of every automatic link-info reply.
9 LINK_INFO = "[Link Info]"
# Character class passed to URI.escape as the "unsafe" set: everything that is
# not a URI unreserved or reserved character, '%', '#' or a space.
# NOTE(review): the third Regexp.new argument ('N') is a legacy encoding flag;
# recent Ruby releases no longer accept a third argument — confirm the Ruby
# versions this file has to run on.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration options registered by this plugin. The :desc string of each
# option documents its meaning.
# NOTE(review): several registrations show no :default entry (and the last two
# end on a trailing comma) in the lines visible here — confirm the full option
# lists against the complete file.

# url.max_urls must be a positive integer (see the :validate proc).
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
18 Config.register Config::BooleanValue.new('url.auto_shorten',
20 :desc => "Automatically spit out shortened URLs when they're seen. Check shortenurls for config options")
21 Config.register Config::IntegerValue.new('url.auto_shorten_min_length',
23 :desc => "Minimum length of URL to auto-shorten. Only has an effect when url.auto_shorten is true.")
24 Config.register Config::BooleanValue.new('url.titles_only',
26 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
27 Config.register Config::BooleanValue.new('url.first_par',
29 :desc => "Also try to get the first paragraph of a web page")
30 Config.register Config::IntegerValue.new('url.first_par_length',
32 :desc => "The max length of the first paragraph")
33 Config.register Config::ArrayValue.new('url.first_par_whitelist',
34 :default => ['twitter.com'],
35 :desc => "List of url patterns to show the content for.")
36 Config.register Config::BooleanValue.new('url.info_on_list',
38 :desc => "Show link info when listing/searching for urls")
# Changing url.no_info_hosts calls reset_no_info_hosts on the 'url' plugin so
# that the compiled host regexp follows the new list.
39 Config.register Config::ArrayValue.new('url.no_info_hosts',
40 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
41 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
42 :desc => "A list of regular expressions matching hosts for which no info should be provided")
43 Config.register Config::ArrayValue.new('url.only_on_channels',
44 :desc => "Show link info only on these channels",
46 Config.register Config::ArrayValue.new('url.ignore',
47 :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
# NOTE(review): the header of the enclosing method is not visible in this
# excerpt — presumably the plugin constructor; confirm.
# Use an empty Array as the registry default value.
52 @registry.set_default(Array.new)
# If the stored url.display_link_info value is not an Integer, re-set the option
# from the string form of the current value.
53 unless @bot.config['url.display_link_info'].kind_of?(Integer)
54 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Select the :htmlinfo filter group (the same group name is passed to
# @bot.filter when link info is fetched).
57 self.filter_group = :htmlinfo
# Rebuild @no_info_hosts, the regexp matching hosts for which no link info is
# provided, from the 'url.no_info_hosts' configuration list.
#
# Every entry is compiled as a case-insensitive regular expression and the
# results are combined into one alternation. Blank entries are dropped and an
# empty list yields a regexp that matches nothing: joining the list with '|'
# would turn both cases into an empty (alternative of the) pattern, which
# matches every host and would silently disable link info everywhere.
#
# Raises RegexpError if an entry is not a valid regular expression.
def reset_no_info_hosts
  patterns = Array(@bot.config['url.no_info_hosts']).map { |pat| pat.to_s }
  patterns = patterns.reject { |pat| pat.empty? }
  compiled = patterns.map { |pat| Regexp.new(pat, Regexp::IGNORECASE) }
  # Regexp.union with no arguments returns /(?!)/, which never matches.
  @no_info_hosts = Regexp.union(*compiled)
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Returns the usage text for the plugin's commands. Neither argument is used
# by the visible body; the same text is returned for every topic.
66 def help(plugin, topic="")
67 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns the result of pagedata.ircify_html_title.
# NOTE(review): ircify_html_title is defined outside this file; pagedata is
# presumably the HTML of a page as a String — confirm.
70 def get_title_from_html(pagedata)
71 return pagedata.ircify_html_title
# Build the link-info text for a URL.
#
# uri_str:: a URI object, or a String that URI.parse accepts
# opts::    options hash; :always_reply is read in the visible lines
#
# Returns nil when the scheme does not match /https?/; a message String when
# the host lookup fails; a "disabled" message (or false, when
# opts[:always_reply] is not set) when the host or one of its lookup results
# matches @no_info_hosts; otherwise the info fields joined with ", " when a
# title was found or url.titles_only is off.
# Raises a RuntimeError carrying the underlying message when fetching or
# processing the page fails.
# NOTE(review): several lines of this method (begin/else/end, the setup of
# `logopts`, `title` and `extra`) are not visible in this excerpt; the comments
# here describe only the lines shown.
74 def get_title_for_url(uri_str, opts = {})
76 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme containing "http"
# passes this check — confirm that is intended.
77 return if url.scheme !~ /https?/
79 # also check the ip, the canonical name and the aliases
# NOTE(review): TCPSocket.gethostbyname is marked deprecated in current Ruby
# documentation — confirm before upgrading.
81 checks = TCPSocket.gethostbyname(url.host)
84 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse to fetch anything for hosts matching the no-info regexp.
90 unless checks.grep(@no_info_hosts).empty?
91 return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
100 debug "+ getting info for #{url.request_uri}"
# Run the URL through the :htmlinfo filter; the result is read below through
# its :headers, :title and :content keys.
101 info = @bot.filter(:htmlinfo, url)
102 logopts[:htmlinfo] = info
103 resp = info[:headers]
105 logopts[:title] = title = info[:title]
108 logopts[:extra] = info[:content]
110 max_length = @bot.config['url.first_par_length']
112 whitelist = @bot.config['url.first_par_whitelist']
# With a non-empty whitelist, page content (cut to max_length characters) is
# taken when a whitelist pattern matches the URL, case-insensitively.
114 if whitelist.length > 0
115 whitelist.each do |pattern|
116 if Regexp.new(pattern, Regexp::IGNORECASE).match(url.to_s)
117 content = info[:content][0...max_length]
# NOTE(review): presumably the branch for an empty whitelist — confirm.
122 content = info[:content][0...max_length]
125 extra << "#{Bold}text#{Bold}: #{content}" if @bot.config['url.first_par'] and content
# From here on the extra info is built from the response headers.
127 logopts[:extra] = String.new
128 logopts[:extra] << "Content Type: #{resp['content-type']}"
129 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
130 if enc = resp['content-encoding']
131 logopts[:extra] << ", encoding: #{enc}"
132 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Format the first content-length value with thousands separators; any error
# while doing so (e.g. a missing header) leaves size as nil.
135 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
137 logopts[:extra] << ", size: #{size} bytes"
138 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also catches interrupts and exit requests;
# StandardError is usually what is wanted — confirm.
141 rescue Exception => e
147 raise "connecting to site/processing information (#{e.message})"
# Notify listeners that a URL was processed, passing the collected details.
151 call_event(:url_added, url.to_s, logopts)
# The title goes first in the reply.
153 extra.unshift("#{Bold}title#{Bold}: #{title}")
155 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Process the URLs found in message m: optionally reply with a shortened URL,
# fetch and reply with link info, and store each new URL in the registry list
# of the message target.
#
# m::      the message being handled
# params:: options; the visible lines show defaults taken from the
#          url.display_link_info, url.only_on_channels and url.ignore settings
#          and read back as opts[:display_info], opts[:channels], opts[:ignore]
#
# NOTE(review): several lines (the merge of params into opts, the definition
# of `urls` and `urls_displayed`, begin/rescue/else/end lines) are not visible
# in this excerpt; the comments here describe only the lines shown.
158 def handle_urls(m, params={})
160 :display_info => @bot.config['url.display_link_info'],
161 :channels => @bot.config['url.only_on_channels'],
162 :ignore => @bot.config['url.ignore']
165 display_info= opts[:display_info]
166 channels = opts[:channels]
167 ignore = opts[:ignore]
# When a channel list is configured, only act on those channels (compared
# case-insensitively).
169 unless channels.empty?
170 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
# Do nothing for senders matching any ignored hostmask.
173 ignore.each { |u| return if m.source.matches?(u) }
175 return if urls.empty?
176 debug "found urls #{urls.inspect}"
# URLs are only recorded for public messages; list stays nil otherwise.
177 list = m.public? ? @registry[m.target] : nil
178 debug "display link info: #{display_info}"
180 urls.each do |urlstr|
181 debug "working on #{urlstr}"
182 next unless urlstr =~ /^https?:\/\/./
# Reply with a shortened URL when auto-shortening is on and the URL is long
# enough.
# NOTE(review): this line uses `bot` while the rest of the method uses `@bot` —
# confirm that a `bot` accessor exists.
183 if @bot.config['url.auto_shorten'] == true and
184 urlstr.length >= @bot.config['url.auto_shorten_min_length']
185 m.reply(bot.plugins['shortenurls'].shorten(nil, {:url=>urlstr, :called=>true}))
189 debug "Getting title for #{urlstr}..."
192 title = get_title_for_url(urlstr,
193 :always_reply => m.address?,
194 :nick => m.source.nick,
195 :channel => m.channel,
196 :ircline => m.message)
197 debug "Title #{title ? '' : 'not '} found"
198 reply = "#{LINK_INFO} #{title}" if title
201 # we might get a 404 because of trailing punctuation, so we try again
202 # with the last character stripped. this might generate invalid URIs
203 # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
204 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
205 # chop off last non-word character from the unescaped version of
206 # the URL, and retry if we still have enough string to look like a
# NOTE(review): URI.unescape and URI.escape were removed from the standard
# library in Ruby 3.0 — confirm the Ruby versions this file has to run on.
208 unescaped = URI.unescape(urlstr)
209 debug "Unescaped: #{unescaped}"
# sub! returns nil when nothing was stripped, so the URL is only rewritten when
# a trailing non-word character was actually removed.
210 if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
211 urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
214 debug "Not retrying #{unescaped}"
217 reply = "Error #{e.message}"
# Reply only while fewer than display_info links have been displayed.
220 if display_info > urls_displayed
222 m.reply reply, :overlong => :truncate, :to => :public,
223 :nick => (m.address? ? :auto : false)
230 # check to see if this url is already listed
231 next if list.find {|u| u.url == urlstr }
233 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
234 debug "#{list.length} urls so far"
# NOTE(review): the last entry is dropped only when the list already holds more
# than url.max_urls entries; if the new url is added after this check the list
# can reach max_urls + 1 entries — confirm whether `>=` was intended.
235 list.pop if list.length > @bot.config['url.max_urls']
236 debug "storing url #{url.url}"
238 debug "#{list.length} urls now"
# Write the updated list back to the registry.
240 @registry[m.target] = list
# NOTE(review): the header of the enclosing method is not visible in this
# excerpt — presumably the 'info' action of the plugin's command maps; confirm.
# Escape the text given in params[:urls] with the OUR_UNSAFE character set and
# extract the URIs from the escaped text.
244 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
245 urls = URI.extract(escaped)
# Allow link info for as many links as params[:urls] has elements.
249 :display_info => params[:urls].length,
# NOTE(review): the header of the enclosing method is not visible in this
# excerpt — presumably the handler for incoming messages; confirm.
# Pick the http and https URIs out of the message text; nothing to do when
# there are none.
257 urls = URI.extract(m.message, ['http', 'https'])
258 return if urls.empty?
# Handle the URLs on a separate thread.
259 Thread.new { handle_urls(m, :urls => urls) }
# Reply with one line per stored URL: "[timestamp] <nick> url", followed by
# " --> title" when link info is available.
#
# opts:: options hash; callers in this file pass :msg, :channel, :list and :max
#
# NOTE(review): the lines assigning `list`, `max`, `m` and `title` are not
# visible in this excerpt; the comments here describe only the lines shown.
262 def reply_urls(opts={})
265 channel = opts[:channel]
# Nothing to do unless a list, a maximum and a message are all present.
267 return unless list and max and m
# Only the first `max` entries of the list are shown.
268 list[0..(max-1)].each do |url|
269 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
270 if @bot.config['url.info_on_list']
# Any error raised while fetching the info is swallowed and yields nil.
272 get_title_for_url(url.url,
273 :nick => url.nick, :channel => channel) rescue nil
274 # If the url info was missing and we now have some, try to upgrade it
275 if channel and title and not url.info
276 ll = @registry[channel]
278 if el = ll.find { |u| u.url == url.url }
# Write the list back so the change is kept in the registry.
280 @registry[channel] = ll
283 disp << " --> #{title}" if title
285 m.reply disp, :overlong => :truncate
# NOTE(review): the header of the enclosing method is not visible in this
# excerpt — presumably the action behind the 'urls ...' command maps; confirm.
# Query the channel given as a parameter, or the message target when none is
# given.
290 channel = params[:channel] ? params[:channel] : m.target
291 max = params[:limit].to_i
294 list = @registry[channel]
296 m.reply "no urls seen yet for channel #{channel}"
298 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Search the URLs stored for a channel and reply with the matches.
#
# m::      the message that triggered the search
# params:: :channel (optional, defaults to the message target), :limit and
#          :string (the regular expression source to search for)
#
# NOTE(review): some lines (between the parameter reads and the regexp, and
# the if/else/end around the replies) are not visible in this excerpt.
302 def search(m, params)
303 channel = params[:channel] ? params[:channel] : m.target
304 max = params[:limit].to_i
305 string = params[:string]
# NOTE(review): the pattern comes from the user; no rescue for RegexpError is
# visible here — confirm how an invalid pattern is handled.
308 regex = Regexp.new(string, Regexp::IGNORECASE)
# An entry matches on its url or nick, and also on its stored info when
# url.info_on_list is enabled; matching is case-insensitive.
309 list = @registry[channel].find_all {|url|
310 regex.match(url.url) || regex.match(url.nick) ||
311 (@bot.config['url.info_on_list'] && regex.match(url.info))
314 m.reply "no matches for channel #{channel}"
316 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its commands. :limit defaults to 4 and
# must consist of digits only.
# NOTE(review): the maps ending on a trailing comma continue on lines that are
# not visible in this excerpt.
321 plugin = UrlPlugin.new
# Both 'urls info' and 'url info' go to the 'info' action; *urls collects the
# rest of the command.
322 plugin.map 'urls info *urls', :action => 'info'
323 plugin.map 'url info *urls', :action => 'info'
324 plugin.map 'urls search :channel :limit :string', :action => 'search',
325 :defaults => {:limit => 4},
326 :requirements => {:limit => /^\d+$/},
328 plugin.map 'urls search :limit :string', :action => 'search',
329 :defaults => {:limit => 4},
330 :requirements => {:limit => /^\d+$/},
332 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
333 :requirements => {:limit => /^\d+$/},
335 plugin.map 'urls :limit', :defaults => {:limit => 4},
336 :requirements => {:limit => /^\d+$/},