# Record type for one stored url. Instances are built further down as
# Url.new(m.target, m.sourcenick, Time.new, urlstr, title), so the fields are:
# the channel (message target), the nick that posted it, the time it was
# seen, the url string, and the link info (title) retrieved for it.
# NOTE(review): define_structure is a helper defined outside this view;
# presumably it creates a Struct-like class -- confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin that extracts urls from messages, replies with link info for them,
# and keeps a per-target list of the urls it has seen (see handle_urls).
# NOTE(review): in this view several Config.register calls are missing lines
# (e.g. their :default entries); only the visible options are described.
8 class UrlPlugin < Plugin
# Prefix put in front of the link-info reply built in handle_urls.
9 LINK_INFO = "[Link Info]"
# Matches every character that is neither URI-unreserved nor URI-reserved,
# and is not '%', '#' or a space. It is passed to URI.escape as the set of
# characters to escape.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Upper bound on the stored url list; the validator rejects values <= 0.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# Integer option: how many links per line get link info (see handle_urls).
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# When true, handle_urls replies with a shortened url obtained from the
# 'shortenurls' plugin.
18 Config.register Config::BooleanValue.new('url.auto_shorten',
20 :desc => "Automatically spit out shortened URLs when they're seen. Check shortenurls for config options")
# Urls shorter than this are not auto-shortened.
21 Config.register Config::IntegerValue.new('url.auto_shorten_min_length',
23 :desc => "Minimum length of URL to auto-shorten. Only has an effect when url.auto_shorten is true.")
# When true, get_title_for_url only returns info for pages that have a title.
24 Config.register Config::BooleanValue.new('url.titles_only',
26 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# When true, a "text" entry with the page content is added to the info.
27 Config.register Config::BooleanValue.new('url.first_par',
29 :desc => "Also try to get the first paragraph of a web page")
# Number of characters of page content kept for the "text" entry.
30 Config.register Config::IntegerValue.new('url.first_par_length',
32 :desc => "The max length of the first paragraph")
# Patterns (compiled case-insensitively in get_title_for_url) matched against
# the url to decide whether its content is shown.
33 Config.register Config::ArrayValue.new('url.first_par_whitelist',
34 :default => ['twitter.com'],
35 :desc => "List of url patterns to show the content for.")
# When true, reply_urls fetches and appends link info for each listed url,
# and search also matches against the stored info.
36 Config.register Config::BooleanValue.new('url.info_on_list',
38 :desc => "Show link info when listing/searching for urls")
# Host patterns for which no info is fetched. The defaults cover localhost
# and the 192.168.*, 10.*, 127.* and 172.16-31.* address prefixes. Changing
# the value triggers reset_no_info_hosts on the 'url' plugin so the compiled
# regexp is rebuilt.
39 Config.register Config::ArrayValue.new('url.no_info_hosts',
40 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
41 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
42 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# Channel list checked in handle_urls; an empty list means no restriction.
43 Config.register Config::ArrayValue.new('url.only_on_channels',
44 :desc => "Show link info only on these channels",
# Hostmasks checked with m.source.matches? in handle_urls.
46 Config.register Config::ArrayValue.new('url.ignore',
47 :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
# NOTE(review): the enclosing `def` line is not visible in this view; these
# statements look like one-time plugin setup (presumably the body of
# initialize) -- confirm against the full file.
# Use a new empty Array as the registry default (presumably what is returned
# for targets with no stored urls -- confirm set_default semantics).
52 @registry.set_default(Array.new)
# If the configured url.display_link_info value is not an Integer, feed its
# string form back through the config item's set_string (presumably so it is
# re-parsed as an integer -- confirm).
53 unless @bot.config['url.display_link_info'].kind_of?(Integer)
54 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Select the :htmlinfo filter group; get_title_for_url calls
# @bot.filter(:htmlinfo, url).
57 self.filter_group = :htmlinfo
# Rebuild @no_info_hosts from the url.no_info_hosts config list.
# The configured patterns are joined with '|' into one alternation and
# compiled into a single Regexp; the second argument (true) makes the match
# case-insensitive. This method is also the on_change hook of the
# url.no_info_hosts option.
61 def reset_no_info_hosts
62 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
63 debug "no info hosts regexp set to #{@no_info_hosts}"
# Return the usage text for the plugin's commands.
# Neither +plugin+ nor +topic+ is consulted on the visible lines; the same
# string is returned regardless of the arguments.
66 def help(plugin, topic="")
67 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Return the title of an HTML page.
# Delegates to ircify_html_title on +pagedata+; that method is defined
# outside this view (presumably a String extension -- confirm).
70 def get_title_from_html(pagedata)
71 return pagedata.ircify_html_title
# Collect link info for a url and return it as one string.
#
# uri_str:: a URI object, or a string that is parsed with URI.parse
# opts::    options hash; :always_reply is read here to decide what to
#           return for hosts matching @no_info_hosts
#
# Returns nil when the scheme does not match /https?/, an "Unable to
# retrieve info" string when host lookup fails, a "disabled" string or false
# for hosts matching @no_info_hosts, and otherwise the collected entries
# joined with ", " (only when a title was found or url.titles_only is off).
# Failures while fetching/processing are re-raised as a RuntimeError whose
# message wraps the original error message.
#
# NOTE(review): several lines of this method are not visible in this view
# (the begin/rescue/else/end structure and the initialisation of +extra+ and
# +logopts+); the comments below describe only the visible statements.
74 def get_title_for_url(uri_str, opts = {})
76 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# The pattern is not anchored, so any scheme containing "http" passes.
77 return if url.scheme !~ /https?/
79 # also check the ip, the canonical name and the aliases
81 checks = TCPSocket.gethostbyname(url.host)
# Presumably inside a rescue for the lookup above (the rescue line is not
# visible) -- confirm.
84 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse to fetch info when any lookup result matches the no-info regexp.
# The refusal message is only returned when the caller asked for
# :always_reply; otherwise false is returned.
90 unless checks.grep(@no_info_hosts).empty?
91 return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
100 debug "+ getting info for #{url.request_uri}"
# Run the url through the :htmlinfo filter; the result is read below via
# its :headers, :title and :content keys.
101 info = @bot.filter(:htmlinfo, url)
102 logopts[:htmlinfo] = info
103 resp = info[:headers]
105 logopts[:title] = title = info[:title]
108 logopts[:extra] = info[:content]
# Page content is cut to url.first_par_length characters.
110 max_length = @bot.config['url.first_par_length']
112 whitelist = @bot.config['url.first_par_whitelist']
# With a non-empty whitelist, content is taken when a pattern matches the url
# (case-insensitively).
114 if whitelist.length > 0
115 whitelist.each do |pattern|
116 if Regexp.new(pattern, Regexp::IGNORECASE).match(url.to_s)
117 content = info[:content][0...max_length]
# Presumably the branch for an empty whitelist (the else line is not visible).
122 content = info[:content][0...max_length]
125 extra << "#{Bold}text#{Bold}: #{content}" if @bot.config['url.first_par'] and content
# Header-derived details: content type, encoding and size.
# NOTE(review): the condition selecting this part is not visible in this view.
127 logopts[:extra] = String.new
128 logopts[:extra] << "Content Type: #{resp['content-type']}"
# The type is only shown to the user when there is no title.
129 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
130 if enc = resp['content-encoding']
131 logopts[:extra] << ", encoding: #{enc}"
132 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Format the first content-length header value with "," separators
# (presumably thousands grouping -- confirm); any error while doing so leaves
# size as nil because of the rescue modifier.
135 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
137 logopts[:extra] << ", size: #{size} bytes"
138 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# Any exception is caught and re-raised as a RuntimeError that carries the
# original message.
141 rescue Exception => e
147 raise "connecting to site/processing information (#{e.message})"
# Emit the :url_added event with the url string and the collected log options.
151 call_event(:url_added, url.to_s, logopts)
# The title entry goes to the front of the list.
153 extra.unshift("#{Bold}title#{Bold}: #{title}")
# Title-less results are suppressed when url.titles_only is set.
155 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Process the urls found for message +m+: optionally reply with a shortened
# url, reply with link info, and record each new url in the registry list of
# the message target.
#
# m::      the message being handled
# params:: options hash; :display_info, :channels and :ignore default to the
#          url.display_link_info, url.only_on_channels and url.ignore config
#          values (presumably overridable through +params+; the merge line is
#          not visible -- confirm)
#
# NOTE(review): several lines are not visible in this view (the opts
# construction, the initialisation of +urls+ and +urls_displayed+, the
# begin/rescue/retry structure and the insertion of the new entry into
# +list+); the comments below describe only the visible statements.
158 def handle_urls(m, params={})
160 :display_info => @bot.config['url.display_link_info'],
161 :channels => @bot.config['url.only_on_channels'],
162 :ignore => @bot.config['url.ignore']
165 display_info= opts[:display_info]
166 channels = opts[:channels]
167 ignore = opts[:ignore]
# With a non-empty channel list, only messages from a listed channel are
# handled (names compared case-insensitively).
169 unless channels.empty?
170 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
# Drop messages whose source matches any entry of the ignore list.
173 ignore.each { |u| return if m.source.matches?(u) }
175 return if urls.empty?
176 debug "found urls #{urls.inspect}"
# Urls are only stored for public messages; otherwise list is nil.
177 list = m.public? ? @registry[m.target] : nil
178 debug "display link info: #{display_info}"
180 urls.each do |urlstr|
181 debug "working on #{urlstr}"
# Skip anything that is not http:// or https:// followed by at least one
# character.
182 next unless urlstr =~ /^https?:\/\/./
# Auto-shorten: reply with the result of the 'shortenurls' plugin for urls
# that are long enough.
# NOTE(review): this line uses `bot` while the rest of the method uses
# `@bot`; confirm a `bot` accessor exists.
183 if @bot.config['url.auto_shorten'] == true and
184 urlstr.length >= @bot.config['url.auto_shorten_min_length']
185 m.reply(bot.plugins['shortenurls'].shorten(nil, {:url=>urlstr, :called=>true}))
189 debug "Getting title for #{urlstr}..."
# :always_reply is set when the bot was addressed; get_title_for_url uses it
# to decide whether to return a refusal message for blocked hosts.
192 title = get_title_for_url(urlstr,
193 :always_reply => m.address?,
194 :nick => m.source.nick,
195 :channel => m.channel,
196 :ircline => m.message)
197 debug "Title #{title ? '' : 'not '} found"
198 reply = "#{LINK_INFO} #{title}" if title
201 # we might get a 404 because of trailing punctuation, so we try again
202 # with the last character stripped. this might generate invalid URIs
203 # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
204 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
205 # chop off last non-word character from the unescaped version of
206 # the URL, and retry if we still have enough string to look like a
208 unescaped = URI.unescape(urlstr)
209 debug "Unescaped: #{unescaped}"
# sub! returns nil when nothing was removed, so this only proceeds when a
# trailing non-word character was actually stripped; urlstr is then
# overwritten in place with the re-escaped, shortened url.
210 if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
211 urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
214 debug "Not retrying #{unescaped}"
217 reply = "Error #{e.message}"
# Reply only while fewer than display_info urls have been shown. Overlong
# replies are truncated, and the nick prefix is only used when the bot was
# addressed.
220 if display_info > urls_displayed
222 m.reply reply, :overlong => :truncate, :to => :public,
223 :nick => (m.address? ? :auto : false)
230 # check to see if this url is already listed
231 next if list.find {|u| u.url == urlstr }
233 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
234 debug "#{list.length} urls so far"
# Remove the last list element once the list exceeds url.max_urls
# (presumably the oldest entry, with new urls added at the front on a line
# not visible here -- confirm).
235 list.pop if list.length > @bot.config['url.max_urls']
236 debug "storing url #{url.url}"
238 debug "#{list.length} urls now"
# Write the list back to the registry under the message target.
240 @registry[m.target] = list
# NOTE(review): the `def` line of this action is not visible in this view; the
# 'urls info *urls' and 'url info *urls' maps at the bottom of the file route
# to an action named 'info', which presumably is this code -- confirm.
# The :urls params are turned into one string, escaped with the OUR_UNSAFE
# character set, and URIs are then extracted from the escaped text. The
# :display_info value is the number of :urls params supplied; it is presumably
# passed on to handle_urls (the call line is not visible).
244 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
245 urls = URI.extract(escaped)
249 :display_info => params[:urls].length,
# NOTE(review): the `def` line of this handler is not visible in this view;
# it receives a message +m+ (presumably every message the bot sees -- confirm).
# The message text is escaped with the OUR_UNSAFE character set and only
# http/https URIs are extracted from it. When any are found they are handed
# to handle_urls on a new thread, so this handler does not wait for the
# lookups to finish.
257 escaped = URI.escape(m.message, OUR_UNSAFE)
258 urls = URI.extract(escaped, ['http', 'https'])
259 return if urls.empty?
260 Thread.new { handle_urls(m, :urls => urls) }
# Reply with up to +max+ entries of a url list, one reply per entry.
#
# opts:: options hash; :channel is read on a visible line. Callers in this
#        file also pass :msg, :list and :max, which presumably populate the
#        locals m, list and max on lines not visible here -- confirm.
#
# Returns early (nil) unless list, max and m are all set.
263 def reply_urls(opts={})
266 channel = opts[:channel]
268 return unless list and max and m
# Only the first +max+ entries are shown.
269 list[0..(max-1)].each do |url|
270 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
271 if @bot.config['url.info_on_list']
# Fetch link info for the entry; any error is swallowed by the rescue
# modifier and yields nil. NOTE(review): the assignment target (presumably
# `title =`) is on a line not visible here.
273 get_title_for_url(url.url,
274 :nick => url.nick, :channel => channel) rescue nil
275 # If the url info was missing and we now have some, try to upgrade it
276 if channel and title and not url.info
277 ll = @registry[channel]
279 if el = ll.find { |u| u.url == url.url }
# Store the list back under the channel (the line that updates the found
# entry is not visible).
281 @registry[channel] = ll
284 disp << " --> #{title}" if title
286 m.reply disp, :overlong => :truncate
# NOTE(review): the `def` line of this action is not visible in this view;
# presumably this is the handler for the 'urls ...' command maps -- confirm.
# The channel comes from the :channel param when given, otherwise from the
# message target; :limit is converted to an integer. The stored list for that
# channel is read from the registry, and the handler either reports that no
# urls were seen or passes the list to reply_urls (the condition choosing
# between the two replies is not visible).
291 channel = params[:channel] ? params[:channel] : m.target
292 max = params[:limit].to_i
295 list = @registry[channel]
297 m.reply "no urls seen yet for channel #{channel}"
299 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Search the stored urls of a channel and reply with the matches.
#
# m::      the requesting message
# params:: :channel (optional, defaults to the message target), :limit
#          (converted to an integer) and :string (the search pattern)
#
# NOTE(review): the condition choosing between the "no matches" reply and
# reply_urls is on lines not visible in this view.
303 def search(m, params)
304 channel = params[:channel] ? params[:channel] : m.target
305 max = params[:limit].to_i
306 string = params[:string]
# The search string is compiled as a case-insensitive regular expression.
309 regex = Regexp.new(string, Regexp::IGNORECASE)
# An entry matches on its url or nick, and also on its stored info when
# url.info_on_list is enabled.
310 list = @registry[channel].find_all {|url|
311 regex.match(url.url) || regex.match(url.nick) ||
312 (@bot.config['url.info_on_list'] && regex.match(url.info))
315 m.reply "no matches for channel #{channel}"
317 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Create the plugin instance and register its command templates.
# NOTE(review): the search/list maps end with a trailing comma, so each has a
# continuation line that is not visible in this view.
322 plugin = UrlPlugin.new
# Both 'urls info' and 'url info' route to the 'info' action; *urls collects
# the remaining words.
323 plugin.map 'urls info *urls', :action => 'info'
324 plugin.map 'url info *urls', :action => 'info'
# Search, with and without an explicit channel. :limit defaults to 4 and must
# consist of digits only.
325 plugin.map 'urls search :channel :limit :string', :action => 'search',
326 :defaults => {:limit => 4},
327 :requirements => {:limit => /^\d+$/},
329 plugin.map 'urls search :limit :string', :action => 'search',
330 :defaults => {:limit => 4},
331 :requirements => {:limit => /^\d+$/},
# Listing, with and without an explicit channel. No :action is given on the
# visible lines (presumably the default action is used -- confirm).
333 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
334 :requirements => {:limit => /^\d+$/},
336 plugin.map 'urls :limit', :defaults => {:limit => 4},
337 :requirements => {:limit => /^\d+$/},