# Declares the Url record with the fields channel, nick, time, url and info.
# handle_urls builds instances with Url.new(target, nick, time, url, title).
# NOTE(review): define_structure is defined outside this excerpt; presumably it
# creates a Struct-like class — confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
8 class UrlPlugin < Plugin
# Prefix placed in front of link information that is replied to the channel.
9 LINK_INFO = "[Link Info]"
# Matches any character that is not URI-unreserved, URI-reserved, '%', '#' or a
# space. It is handed to URI.escape as the set of characters to escape.
# NOTE(review): the third Regexp.new argument ('N') is a legacy encoding flag;
# confirm it is still accepted by the Ruby version this plugin runs on.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration keys registered by this plugin.
# NOTE(review): several registrations below look truncated in this excerpt
# (no :default line or closing parenthesis is visible for some of them);
# check them against the full file before relying on these notes.
#
# url.max_urls: upper bound for the stored url list; validated to be positive,
# default 100.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# url.display_link_info: integer limit of links per line that get automatic
# link info; handle_urls compares it against the number already displayed.
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# url.titles_only: when set, get_title_for_url only returns info if a title was found.
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# url.first_par: when set, page text / encoding / size are added to the displayed info.
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
# url.info_on_list: when set, listing and searching also show (and match on) link info.
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# url.no_info_hosts: regexp fragments for hosts that never get link info. The
# defaults cover localhost and private/loopback IPv4 prefixes. Changing the
# value calls reset_no_info_hosts on the 'url' plugin to rebuild the regexp.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# url.only_on_channels: if non-empty, handle_urls only acts on these channels.
31 Config.register Config::ArrayValue.new('url.only_on_channels',
32 :desc => "Show link info only on these channels",
# url.ignore: hostmasks whose messages handle_urls skips entirely.
34 Config.register Config::ArrayValue.new('url.ignore',
35 :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
40 @registry.set_default(Array.new)
41 unless @bot.config['url.display_link_info'].kind_of?(Integer)
42 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
45 self.filter_group = :htmlinfo
# Rebuilds @no_info_hosts from the 'url.no_info_hosts' configuration list.
# The entries are joined with '|' into one case-insensitive regexp (the second
# Regexp.new argument is true). It is also invoked from the :on_change hook of
# that configuration key.
# NOTE(review): if the configured list is empty, the joined pattern is the
# empty string, which matches every input; the grep check in get_title_for_url
# would then treat every host as disabled. Confirm whether an empty list is
# meant to disable the filter instead.
49 def reset_no_info_hosts
50 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
51 debug "no info hosts regexp set to #{@no_info_hosts}"
# Returns the usage text for this plugin.
#
# plugin - not used in the visible body.
# topic  - not used in the visible body; defaults to "".
#
# The same string is returned regardless of the arguments.
54 def help(plugin, topic="")
55 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns the title extracted from a page by delegating to
# pagedata.ircify_html_title.
#
# pagedata - object responding to ircify_html_title.
#            NOTE(review): presumably the HTML source as a String with an
#            rbot extension method; the helper is defined outside this excerpt.
58 def get_title_from_html(pagedata)
59 return pagedata.ircify_html_title
# Builds a one-line description (title, text, type, encoding, size) of the
# resource behind a url.
#
# uri_str - a URI object, or a string that is passed to URI.parse.
# opts    - options hash; :always_reply is read below to decide whether a
#           refusal message is returned for hosts matching @no_info_hosts.
#           Callers in this file also pass :nick, :channel and :ircline.
#
# Returns the collected parts joined with ", " when a title was found or
# url.titles_only is off. Other visible return values: nil for a scheme that
# does not match the http/https check, an "Unable to retrieve info" string,
# and false or a "Sorry, ..." string for disabled hosts.
# Raises RuntimeError (plain string raise below) when connecting to the site
# or processing the information fails.
# NOTE(review): the begin/rescue scaffolding and the initialisation of
# `logopts`, `extra` and `resp` are not visible in this excerpt.
62 def get_title_for_url(uri_str, opts = {})
# Accept either an already parsed URI or a string.
64 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme that merely contains
# "http" passes this check — confirm whether an exact match is intended.
65 return if url.scheme !~ /https?/
67 # also check the ip, the canonical name and the aliases
# NOTE(review): TCPSocket.gethostbyname is a legacy API in recent Ruby
# releases — confirm against the target Ruby version.
69 checks = TCPSocket.gethostbyname(url.host)
# `e` is presumably bound by a rescue clause on a line not shown here.
72 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse hosts for which any resolved name/address matches the configured
# no-info regexp; only say so explicitly when :always_reply is set.
78 unless checks.grep(@no_info_hosts).empty?
79 return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
88 debug "+ getting info for #{url.request_uri}"
# Run the url through the :htmlinfo filter group to obtain title/content.
89 info = @bot.filter(:htmlinfo, url)
91 logopts[:htmlinfo] = info
94 logopts[:title] = title = info[:title]
# Page text is always logged, but only displayed when url.first_par is set.
97 logopts[:extra] = info[:content]
98 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# Header-based details; presumably the branch taken when no page content was
# extracted (the branching lines are not visible here).
100 logopts[:extra] = String.new
101 logopts[:extra] << "Content Type: #{resp['content-type']}"
102 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
103 if enc = resp['content-encoding']
104 logopts[:extra] << ", encoding: #{enc}"
105 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Insert thousands separators into the content length; any failure (for
# example a missing header) leaves size as nil through the rescue modifier.
108 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
110 logopts[:extra] << ", size: #{size} bytes"
111 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also catches non-StandardError exceptions —
# confirm this is intended.
114 rescue Exception => e
120 raise "connecting to site/processing information (#{e.message})"
# Notify listeners that a url was processed, passing along the collected data.
124 call_event(:url_added, url.to_s, logopts)
# The title goes first in the displayed parts.
126 extra.unshift("#{Bold}title#{Bold}: #{title}")
128 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the urls found in a message: optionally replies with link info
# and records each url in the registry list of the message target.
#
# m      - the message being handled; its channel, source, target, message
#          and the public?/address? predicates are used below.
# params - options hash. The visible defaults are :display_info, :channels
#          and :ignore, all taken from the bot configuration; a caller in this
#          file passes :urls.
#          NOTE(review): the lines that merge params into `opts` and assign
#          `urls` and `urls_displayed` are not visible in this excerpt.
131 def handle_urls(m, params={})
133 :display_info => @bot.config['url.display_link_info'],
134 :channels => @bot.config['url.only_on_channels'],
135 :ignore => @bot.config['url.ignore']
138 display_info= opts[:display_info]
139 channels = opts[:channels]
140 ignore = opts[:ignore]
# When a channel whitelist is configured, act only on those channels
# (compared case-insensitively).
# NOTE(review): m.channel is dereferenced unconditionally; confirm it cannot
# be nil here (for example for private messages).
142 unless channels.empty?
143 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
# Skip messages from any source matching an ignored hostmask.
146 ignore.each { |u| return if m.source.matches?(u) }
148 return if urls.empty?
149 debug "found urls #{urls.inspect}"
# Urls are only stored for public messages; otherwise list is nil.
150 list = m.public? ? @registry[m.target] : nil
151 debug "display link info: #{display_info}"
153 urls.each do |urlstr|
154 debug "working on #{urlstr}"
# Only http/https urls with something after the "//" are processed.
155 next unless urlstr =~ /^https?:\/\/./
157 debug "Getting title for #{urlstr}..."
# Reply even on refusal when the bot was addressed directly.
160 title = get_title_for_url(urlstr,
161 :always_reply => m.address?,
162 :nick => m.source.nick,
163 :channel => m.channel,
164 :ircline => m.message)
165 debug "Title #{title ? '' : 'not '} found"
166 reply = "#{LINK_INFO} #{title}" if title
169 # we might get a 404 because of trailing punctuation, so we try again
170 # with the last character stripped. this might generate invalid URIs
171 # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
172 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
173 # chop off last non-word character from the unescaped version of
174 # the URL, and retry if we still have enough string to look like a
# NOTE(review): URI.unescape / URI.escape are obsolete in newer Ruby
# releases — confirm against the target Ruby version.
176 unescaped = URI.unescape(urlstr)
177 debug "Unescaped: #{unescaped}"
# sub! returns nil when nothing was removed, so a retry only happens when
# a trailing non-word character was actually stripped.
178 if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
# Mutate urlstr in place so the retried and stored url is the shortened one.
179 urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
182 debug "Not retrying #{unescaped}"
185 reply = "Error #{e.message}"
# Only reply while fewer than display_info links have been shown for this line.
188 if display_info > urls_displayed
190 m.reply reply, :overlong => :truncate, :to => :public,
191 :nick => (m.address? ? :auto : false)
198 # check to see if this url is already listed
# NOTE(review): list is nil when the message is not public; presumably a
# guard on a line not visible here prevents reaching this point — confirm.
199 next if list.find {|u| u.url == urlstr }
201 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
202 debug "#{list.length} urls so far"
# Drop the last entry when the list is above the configured maximum.
203 list.pop if list.length > @bot.config['url.max_urls']
204 debug "storing url #{url.url}"
206 debug "#{list.length} urls now"
# Write the updated list back to the registry.
208 @registry[m.target] = list
212 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
213 urls = URI.extract(escaped)
217 :display_info => params[:urls].length,
225 escaped = URI.escape(m.message, OUR_UNSAFE)
226 urls = URI.extract(escaped, ['http', 'https'])
227 return if urls.empty?
228 Thread.new { handle_urls(m, :urls => urls) }
# Replies to a message with entries of a url list, one reply per entry.
#
# opts - options hash; :channel is read below, and callers in this file pass
#        :msg, :channel, :list and :max.
#        NOTE(review): the assignments of `m`, `list` and `max` are not
#        visible in this excerpt; presumably they come from opts[:msg],
#        opts[:list] and opts[:max] — confirm.
#
# Returns early (nil) when list, max or m is missing.
231 def reply_urls(opts={})
234 channel = opts[:channel]
236 return unless list and max and m
# NOTE(review): when max is 0 the range becomes 0..-1, which selects the whole
# list rather than nothing; the command mappings accept "0" as a limit.
237 list[0..(max-1)].each do |url|
238 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
239 if @bot.config['url.info_on_list']
# Errors while fetching info are swallowed by the rescue modifier.
# NOTE(review): the line that assigns `title` is not visible in this excerpt.
241 get_title_for_url(url.url,
242 :nick => url.nick, :channel => channel) rescue nil
243 # If the url info was missing and we now have some, try to upgrade it
244 if channel and title and not url.info
245 ll = @registry[channel]
# Locate the stored entry for the same url and write the list back.
247 if el = ll.find { |u| u.url == url.url }
249 @registry[channel] = ll
252 disp << " --> #{title}" if title
# Overlong lines are truncated instead of being split.
254 m.reply disp, :overlong => :truncate
259 channel = params[:channel] ? params[:channel] : m.target
260 max = params[:limit].to_i
263 list = @registry[channel]
265 m.reply "no urls seen yet for channel #{channel}"
267 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the stored urls of a channel with a regular expression and replies
# with the matches through reply_urls.
#
# m      - the requesting message; m.target is the fallback channel.
# params - :channel (optional), :limit (converted with to_i) and :string
#          (the regexp source).
#
# Replies "no matches for channel ..." on a line whose guarding condition is
# not visible in this excerpt.
271 def search(m, params)
272 channel = params[:channel] ? params[:channel] : m.target
273 max = params[:limit].to_i
274 string = params[:string]
# Case-insensitive match built from user input.
# NOTE(review): an invalid pattern makes Regexp.new raise RegexpError; no
# rescue is visible in this excerpt — confirm how that is handled.
277 regex = Regexp.new(string, Regexp::IGNORECASE)
# An entry matches on its url, its nick, or (only when url.info_on_list is
# set) its stored info.
278 list = @registry[channel].find_all {|url|
279 regex.match(url.url) || regex.match(url.nick) ||
280 (@bot.config['url.info_on_list'] && regex.match(url.info))
283 m.reply "no matches for channel #{channel}"
285 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command templates.
# NOTE(review): the mappings that end with a trailing comma continue on lines
# that are not visible in this excerpt.
290 plugin = UrlPlugin.new
# "urls info ..." and "url info ..." are both routed to the 'info' action;
# *urls collects the remaining words.
291 plugin.map 'urls info *urls', :action => 'info'
292 plugin.map 'url info *urls', :action => 'info'
# Search with an explicit channel; :limit defaults to 4 and must be all digits.
293 plugin.map 'urls search :channel :limit :string', :action => 'search',
294 :defaults => {:limit => 4},
295 :requirements => {:limit => /^\d+$/},
# Search without an explicit channel.
297 plugin.map 'urls search :limit :string', :action => 'search',
298 :defaults => {:limit => 4},
299 :requirements => {:limit => /^\d+$/},
# Listing with an explicit channel; no :action is given on the visible lines.
301 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
302 :requirements => {:limit => /^\d+$/},
# Listing without an explicit channel.
304 plugin.map 'urls :limit', :defaults => {:limit => 4},
305 :requirements => {:limit => /^\d+$/},