# Declares the Url record used by this plugin, with the fields channel, nick,
# time, url and info. define_structure is provided elsewhere; presumably it
# builds a Struct-like class with one accessor per field — confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin that remembers URLs seen on channels and reports information about
# them (title, content type, encoding, size).
8 class UrlPlugin < Plugin
# Prefix prepended to the link-info replies built in handle_urls.
9 LINK_INFO = "[Link Info]"
# Character class matching everything that is NOT a URI unreserved/reserved
# character, '%', '#' or a space; it is handed to URI.escape as the set of
# characters to escape.
# NOTE(review): the third argument 'N' looks like the legacy kcode flag of old
# Ruby versions — confirm it is still accepted by the Ruby this runs on.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration keys registered by this plugin.
# NOTE(review): several of these calls are only partly visible in this listing
# (the embedded line numbers skip some lines), so the defaults of those keys
# cannot be read from here.
#
# url.max_urls: integer, default 100, validated to be > 0.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# url.display_link_info: integer; upper bound on the number of links per line
# for which info is displayed automatically (0 disables it, per the description).
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# url.titles_only: boolean; restrict link info to links that have a <title>.
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# url.first_par: boolean; also report the first paragraph of a page.
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
# url.info_on_list: boolean; include link info in list/search output.
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# url.no_info_hosts: array of regexp sources. The default covers localhost and
# the 192.168., 10., 127. and 172.16-31. address prefixes. Changing the value
# calls reset_no_info_hosts on the 'url' plugin so the compiled regexp is rebuilt.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# url.only_on_channels: array of channel names; link info is shown only there.
# The remaining arguments and the end of this call are not visible here.
31 Config.register Config::ArrayValue.new('url.only_on_channels',
32 :desc => "Show link info only on these channels",
# NOTE(review): the `def` line enclosing these statements is not visible in
# this listing — presumably the plugin constructor; confirm.
# Use an empty Array as the registry default value.
37 @registry.set_default(Array.new)
# If the configured url.display_link_info is not an Integer, feed its string
# form back through set_string on the config item (presumably to coerce a
# value stored by an older, non-integer version of the setting — confirm).
38 unless @bot.config['url.display_link_info'].kind_of?(Integer)
39 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuilds @no_info_hosts from the url.no_info_hosts config entries: the
# patterns are joined with '|' into one Regexp (second argument true makes it
# case-insensitive) and the result is written to the debug log.
# NOTE(review): with an empty list the joined source is '' and the resulting
# regexp matches every string — confirm that is the intended behavior.
44 def reset_no_info_hosts
45 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
46 debug "no info hosts regexp set to #{@no_info_hosts}"
# Returns the usage text for the plugin commands ('url info', 'urls',
# 'urls search'). Neither plugin nor topic is used by the visible body.
49 def help(plugin, topic="")
50 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns the result of calling ircify_html_title on pagedata.
# ircify_html_title is defined elsewhere; presumably it extracts the HTML
# <title> and formats it for IRC — confirm.
53 def get_title_from_html(pagedata)
54 return pagedata.ircify_html_title
# Builds a one-line description (title, and optionally text/type/encoding/size)
# for a URL.
#
# uri_str - a URI object or a String that URI.parse can handle.
# opts    - option hash; the visible code reads opts[:always_reply].
#
# Visible return values: nil for non-http(s) schemes, an error String when the
# host lookup fails, a "disabled" String or false for hosts matching
# @no_info_hosts, and the comma-joined description otherwise.
# Re-raises retrieval/processing failures as a RuntimeError.
# NOTE(review): a number of lines of this method (begin/rescue/end, the
# definitions of extra, logopts, resp and the if/else structure) are not
# visible in this listing; the comments below describe the visible lines only.
57 def get_title_for_url(uri_str, opts = {})
# Accept either an already parsed URI or a string.
59 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# Only http/https is handled.
# NOTE(review): the pattern is unanchored, so any scheme merely containing
# "http" passes — confirm this is acceptable.
60 return if url.scheme !~ /https?/
62 # also check the ip, the canonical name and the aliases
# Resolve the host; the lookup result is matched against @no_info_hosts below.
64 checks = TCPSocket.gethostbyname(url.host)
# `e` is bound by a rescue clause that is not visible here.
67 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse hosts matching the no-info regexp: explain when the caller asked for
# a reply in any case, stay silent (false) otherwise.
73 unless checks.grep(@no_info_hosts).empty?
74 return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
83 debug "+ getting info for #{url.request_uri}"
# Delegate fetching/parsing to the :htmlinfo filter; the result is read below
# through info[:title] and info[:content].
84 info = @bot.filter(:htmlinfo, url)
88 logopts[:title] = title = info[:title]
# The page content is always logged; it is shown only when url.first_par is set.
91 logopts[:extra] = info[:content]
92 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# Alternative description built from the response headers (content type,
# encoding, size). The branch condition and the origin of resp are not visible.
94 logopts[:extra] = String.new
95 logopts[:extra] << "Content Type: #{resp['content-type']}"
96 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
97 if enc = resp['content-encoding']
98 logopts[:extra] << ", encoding: #{enc}"
99 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Insert thousands separators into the content length; any failure while
# doing so leaves size as nil because of the inline rescue.
102 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
104 logopts[:extra] << ", size: #{size} bytes"
105 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also catches non-StandardError exceptions
# (signals, exit requests); consider narrowing it.
108 rescue Exception => e
# Re-raised as a RuntimeError; handle_urls inspects this message text.
114 raise "connecting to site/processing information (#{e.message})"
# Notify listeners that a URL was processed, passing the collected log options.
118 call_event(:url_added, url.to_s, logopts)
# Put the title first in the description (the guarding condition, if any, is
# not visible here).
120 extra.unshift("#{Bold}title#{Bold}: #{title}")
# With url.titles_only set, title-less links produce no description here.
122 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Looks up link info for a list of URL strings, replies with it (up to a
# per-call limit) and, for public messages, stores new URLs in the registry
# under m.target.
#
# m      - the message being handled.
# params - option hash; callers in this file pass :urls and optionally
#          :display_info.
#
# NOTE(review): the lines building opts and urls from params, the
# begin/rescue/end structure and the urls_displayed bookkeeping are not visible
# in this listing; the comments below describe the visible lines only.
125 def handle_urls(m, params={})
# Defaults taken from the configuration.
127 :display_info => @bot.config['url.display_link_info'],
128 :channels => @bot.config['url.only_on_channels']
131 display_info= opts[:display_info]
132 channels = opts[:channels]
# When a channel whitelist is configured, ignore messages from other channels
# (case-insensitive comparison).
# NOTE(review): m.channel.downcase raises if m.channel is nil — confirm that
# cannot happen here (e.g. for private messages).
133 unless channels.empty?
134 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
137 return if urls.empty?
138 debug "found urls #{urls.inspect}"
# URLs are only stored for public messages; list stays nil otherwise.
139 list = m.public? ? @registry[m.target] : nil
140 debug "display link info: #{display_info}"
142 urls.each do |urlstr|
143 debug "working on #{urlstr}"
# Skip anything that is not http(s):// followed by at least one character.
144 next unless urlstr =~ /^https?:\/\/./
146 debug "Getting title for #{urlstr}..."
149 title = get_title_for_url(urlstr,
150 :always_reply => m.address?,
151 :nick => m.source.nick,
152 :channel => m.channel,
153 :ircline => m.message)
154 debug "Title #{title ? '' : 'not '} found"
155 reply = "#{LINK_INFO} #{title}" if title
158 # we might get a 404 because of trailing punctuation, so we try again
159 # with the last character stripped. this might generate invalid URIs
160 # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
161 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
162 # chop off last character, and retry if we still have enough string to
163 # look like a minimal URL
# chop! shortens urlstr in place, so the retry loop ends once the string no
# longer matches, and the shortened string is what the code below sees.
164 retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
166 reply = "Error #{e.message}"
# Reply only while fewer than display_info links have been shown.
169 if display_info > urls_displayed
171 m.reply reply, :overlong => :truncate, :to => :public,
172 :nick => (m.address? ? :auto : false)
179 # check to see if this url is already listed
180 next if list.find {|u| u.url == urlstr }
182 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
183 debug "#{list.length} urls so far"
# Drop the last entry when the list exceeds url.max_urls.
# NOTE(review): the check uses > and the insertion itself is not visible; if
# the new url is added after this line the list can hold max_urls + 1
# entries — confirm.
184 list.pop if list.length > @bot.config['url.max_urls']
185 debug "storing url #{url.url}"
187 debug "#{list.length} urls now"
# Write the list back to the registry.
189 @registry[m.target] = list
# NOTE(review): the `def` line of this handler is not visible in this listing.
# The 'url info' / 'urls info' routes at the end of the file map to the action
# 'info', which is presumably this method — confirm.
# Escape the characters matched by OUR_UNSAFE in the given parameter, then
# extract every URI from the result (no scheme restriction here).
# NOTE(review): URI.escape is obsolete and was removed in Ruby 3.0 — confirm
# the Ruby version this plugin targets.
193 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
194 urls = URI.extract(escaped)
# Use the number of items in params[:urls] as the display limit; the call
# this argument belongs to is not visible here.
198 :display_info => params[:urls].length,
# NOTE(review): the `def` line of this handler is not visible in this listing;
# it works on m.message, so it is presumably the listener invoked for every
# incoming message — confirm.
# Escape the characters matched by OUR_UNSAFE in the message text and extract
# the http/https URIs from it.
206 escaped = URI.escape(m.message, OUR_UNSAFE)
207 urls = URI.extract(escaped, ['http', 'https'])
208 return if urls.empty?
# Process the URLs in a separate thread.
209 Thread.new { handle_urls(m, :urls => urls) }
# Replies with one line per stored URL, optionally followed by its link info.
#
# opts - option hash; callers in this file pass :msg, :channel, :list and :max.
#
# NOTE(review): the assignments of list, max, m and title, and the
# begin/end/if structure, are not visible in this listing; the comments below
# describe the visible lines only.
212 def reply_urls(opts={})
215 channel = opts[:channel]
# Nothing to do unless a list, a maximum and a message are all available.
217 return unless list and max and m
# Only the first max entries are shown.
218 list[0..(max-1)].each do |url|
219 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
220 if @bot.config['url.info_on_list']
# Any error raised while fetching the info is swallowed by the inline rescue.
222 get_title_for_url(url.url,
223 :nick => url.nick, :channel => channel) rescue nil
224 # If the url info was missing and we now have some, try to upgrade it
225 if channel and title and not url.info
226 ll = @registry[channel]
# Locate the same url in the stored list; the line that updates the found
# entry is not visible here.
228 if el = ll.find { |u| u.url == url.url }
230 @registry[channel] = ll
233 disp << " --> #{title}" if title
235 m.reply disp, :overlong => :truncate
# NOTE(review): the `def` line of this handler is not visible in this listing;
# the 'urls :channel :limit' / 'urls :limit' routes at the end of the file
# presumably dispatch here — confirm.
# Use the channel given in the command, falling back to the message target.
240 channel = params[:channel] ? params[:channel] : m.target
241 max = params[:limit].to_i
244 list = @registry[channel]
# Reply shown when there is nothing to list (the condition is not visible here).
246 m.reply "no urls seen yet for channel #{channel}"
248 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the URLs stored for a channel and replies with the matches.
#
# m      - the message that triggered the search.
# params - route parameters; the visible code reads :channel, :limit and :string.
#
# An entry matches when the case-insensitive regexp built from :string matches
# its url or its nick, or its info when url.info_on_list is enabled.
# NOTE(review): several structural lines (max clamping, the empty-result
# condition, block/end lines) are not visible in this listing.
252 def search(m, params)
# Use the channel given in the command, falling back to the message target.
253 channel = params[:channel] ? params[:channel] : m.target
254 max = params[:limit].to_i
255 string = params[:string]
# NOTE(review): the pattern comes straight from user input; Regexp.new raises
# on an invalid pattern and no rescue is visible here — confirm it is handled.
258 regex = Regexp.new(string, Regexp::IGNORECASE)
259 list = @registry[channel].find_all {|url|
260 regex.match(url.url) || regex.match(url.nick) ||
261 (@bot.config['url.info_on_list'] && regex.match(url.info))
# Reply shown when nothing matched (the condition is not visible here).
264 m.reply "no matches for channel #{channel}"
266 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes.
# NOTE(review): several map calls below end with a trailing comma; their
# remaining arguments are on lines not visible in this listing.
271 plugin = UrlPlugin.new
# 'urls info ...' and 'url info ...' both dispatch to the 'info' action; *urls
# collects the rest of the line.
272 plugin.map 'urls info *urls', :action => 'info'
273 plugin.map 'url info *urls', :action => 'info'
# Search with an explicit channel; :limit defaults to 4 and must be all digits.
274 plugin.map 'urls search :channel :limit :string', :action => 'search',
275 :defaults => {:limit => 4},
276 :requirements => {:limit => /^\d+$/},
# Search without a channel argument.
278 plugin.map 'urls search :limit :string', :action => 'search',
279 :defaults => {:limit => 4},
280 :requirements => {:limit => /^\d+$/},
# Listing with an explicit channel; no :action is given on the visible lines.
282 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
283 :requirements => {:limit => /^\d+$/},
# Listing without a channel argument.
285 plugin.map 'urls :limit', :defaults => {:limit => 4},
286 :requirements => {:limit => /^\d+$/},