# Record describing one remembered link. Instances are built further down as
# Url.new(target, nick, time, url string, link info).
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin that extracts urls from messages, can reply with information about
# the linked resource, and keeps a per-target list of the urls it has seen.
8 class UrlPlugin < Plugin
# Prefix placed in front of the link info sent as a reply.
9 LINK_INFO = "[Link Info]"
# Characters handed to URI.escape as "unsafe": anything that is not a URI
# unreserved or reserved character, '%', '#' or a space.
# NOTE(review): recent Ruby releases no longer accept a third positional
# argument ('N') to Regexp.new -- confirm the Ruby versions this must run on.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# url.max_urls: positive integer (validated with v > 0), default 100.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# url.display_link_info: integer; maximum number of links per line for which
# info is displayed, 0 meaning disabled (per its description).
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# url.titles_only: boolean; consulted when deciding whether link info without
# a title is returned at all.
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# url.first_par: boolean; when set, the page text (and encoding/size details)
# are included in the link info.
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
# url.info_on_list: boolean; when set, listings and searches also fetch and
# match against link info.
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# url.no_info_hosts: list of regexp sources for hosts that must not be queried.
# The defaults cover localhost and the private/loopback IPv4 ranges; changing
# the value triggers reset_no_info_hosts on the 'url' plugin.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# url.only_on_channels: list of channel names; when non-empty, urls are only
# handled on these channels (compared case-insensitively in handle_urls).
31 Config.register Config::ArrayValue.new('url.only_on_channels',
32 :desc => "Show link info only on these channels",
# Set-up fragment (the enclosing method header is not visible here;
# presumably the constructor -- confirm).
# Registers a new Array as the registry default value.
37 @registry.set_default(Array.new)
# If the stored 'url.display_link_info' value is not an Integer, feed its
# string form back through the config item's set_string -- presumably to
# coerce a legacy non-integer setting; confirm against Config::IntegerValue.
38 unless @bot.config['url.display_link_info'].kind_of?(Integer)
39 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuilds @no_info_hosts, the case-insensitive regexp that get_title_for_url
# greps host names/addresses against, from the 'url.no_info_hosts' config list.
#
# Blank entries are skipped, and an empty list yields a regexp that never
# matches. Joining an empty list with '|' would produce an empty pattern,
# which matches every host and would silently disable info retrieval
# everywhere. Non-empty lists produce exactly the same regexp as before.
def reset_no_info_hosts
  sources = @bot.config['url.no_info_hosts'].reject { |src| src.to_s.empty? }
  @no_info_hosts =
    if sources.empty?
      /(?!)/ # empty negative lookahead: matches nothing
    else
      Regexp.new(sources.join('|'), Regexp::IGNORECASE)
    end
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Returns the usage text for the url commands. Neither +plugin+ nor +topic+
# influences the result; the same text is returned for every topic.
def help(plugin, topic="")
  [
    "url info <url> => display link info for <url> ",
    "(set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), ",
    "urls [<max>=4] => list <max> last urls mentioned in current channel, ",
    "urls search [<max>=4] <regexp> => search for matching urls. ",
    "In a private message, you must specify the channel to query, ",
    "eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  ].join
end
# Returns whatever +pagedata+ reports as its IRC-formatted HTML title by
# delegating to pagedata.ircify_html_title.
def get_title_from_html(pagedata)
  pagedata.ircify_html_title
end
# Builds the link-info text for uri_str.
#
# uri_str: a URI object, or a string that is handed to URI.parse.
# opts:    extra options; not referenced in the lines visible here
#          (presumably the source of logopts -- confirm).
#
# Returns nil when the scheme does not match /https?/, an explanatory string
# when the host lookup fails or the host matches @no_info_hosts, and otherwise
# the collected fragments joined with ", " -- but only if a title was found or
# 'url.titles_only' is off (nil otherwise).
# The visible rescue re-raises failures as a RuntimeError whose message starts
# with "connecting to site/processing information"; lines not shown may treat
# some exception classes differently.
57 def get_title_for_url(uri_str, opts = {})
59 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# Note the pattern is unanchored, so any scheme containing "http" passes.
60 return if url.scheme !~ /https?/
62 # also check the ip, the canonical name and the aliases
64 checks = TCPSocket.gethostbyname(url.host)
# Reached when the lookup fails (the rescue clause binding e is not visible here).
67 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse to fetch anything whose resolved names/addresses match the blacklist.
73 unless checks.grep(@no_info_hosts).empty?
74 return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
83 debug "+ getting info for #{url.request_uri}"
# The :htmlinfo filter result is read below through its :title and :content keys.
84 info = @bot.filter(:htmlinfo, url)
88 logopts[:title] = title = info[:title]
# Page text is always logged, but only shown when 'url.first_par' is set.
91 logopts[:extra] = info[:content]
92 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# Alternative description built from response headers (resp is assigned in a
# line not shown; presumably this is the branch taken when there is no page
# text -- confirm).
94 logopts[:extra] = String.new
95 logopts[:extra] << "Content Type: #{resp['content-type']}"
96 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
97 if enc = resp['content-encoding']
98 logopts[:extra] << ", encoding: #{enc}"
99 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Insert thousands separators into the first content-length value; any failure
# (e.g. a missing header) is swallowed by the rescue modifier, leaving size nil.
102 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
104 logopts[:extra] << ", size: #{size} bytes"
105 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also traps non-StandardError exceptions.
108 rescue Exception => e
114 raise "connecting to site/processing information (#{e.message})"
# Notify listeners about the url together with the collected log options.
118 call_event(:url_added, url.to_s, logopts)
# The title goes first in the reply.
120 extra.unshift("#{Bold}title#{Bold}: #{title}")
122 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the urls found in message m: looks up link info for each one,
# replies with it while the display limit allows, and records new urls in the
# registry list of the message target.
#
# m:      the message being handled.
# params: options; :display_info defaults to the 'url.display_link_info'
#         config value. The urls themselves are presumably taken from
#         params[:urls] in a line not shown -- confirm.
125 def handle_urls(m, params={})
127 :display_info => @bot.config['url.display_link_info']
130 display_info= opts[:display_info]
# When a channel whitelist is configured, ignore messages from other channels
# (case-insensitive comparison).
# NOTE(review): m.channel.downcase raises if m.channel is nil -- confirm this
# cannot happen for the messages that reach this point.
131 unless (channels = @bot.config['url.only_on_channels']).empty?
132 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
135 return if urls.empty?
136 debug "found urls #{urls.inspect}"
# Only public messages have a stored list; list is nil otherwise.
137 list = m.public? ? @registry[m.target] : nil
138 debug "display link info: #{display_info}"
140 urls.each do |urlstr|
141 debug "working on #{urlstr}"
# Skip anything that is not an http(s) url with at least one character after "//".
142 next unless urlstr =~ /^https?:\/\/./
144 debug "Getting title for #{urlstr}..."
147 title = get_title_for_url(urlstr,
148 :nick => m.source.nick,
149 :channel => m.channel,
150 :ircline => m.message)
151 debug "Title #{title ? '' : 'not '} found"
152 reply = "#{LINK_INFO} #{title}" if title
155 # we might get a 404 because of trailing punctuation, so we try again
156 # with the last character stripped. this might generate invalid URIs
157 # (e.g. because "some.url%2X" gets chopped to "some.url%2"), so catch that too
158 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
159 # chop off last character, and retry if we still have enough string to
160 # look like a minimal URL
# chop! shortens urlstr in place, so each retry works on a shorter string.
161 retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
163 reply = "Error #{e.message}"
# Reply only while fewer urls than the limit have been displayed
# (urls_displayed is maintained in lines not shown).
166 if display_info > urls_displayed
168 m.reply reply, :overlong => :truncate, :to => :public,
169 :nick => (m.address? ? :auto : false)
176 # check to see if this url is already listed
177 next if list.find {|u| u.url == urlstr }
179 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
180 debug "#{list.length} urls so far"
# Drop the last element once the list is longer than 'url.max_urls'.
# NOTE(review): the comparison is '>', so if the new url is inserted after
# this check the list can hold max_urls + 1 entries -- confirm intent.
181 list.pop if list.length > @bot.config['url.max_urls']
182 debug "storing url #{url.url}"
184 debug "#{list.length} urls now"
# Write the updated list back to the registry.
186 @registry[m.target] = list
# Handler fragment (its def line is not visible here; presumably the 'info'
# action mapped for 'url(s) info *urls' -- confirm).
# Escapes the characters matched by OUR_UNSAFE, extracts the URIs, and handles
# them in a new thread with the display limit set to the number of items
# given, so none of them is cut off by the configured limit.
# NOTE(review): URI.escape is not available in current Ruby releases --
# confirm the Ruby versions this must run on.
190 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
191 urls = URI.extract(escaped)
192 Thread.new { handle_urls(m, :urls => urls, :display_info => params[:urls].length) }
# Message-handling fragment (its def line is not visible here).
# Escapes the message text, extracts only http/https URIs, and -- if any were
# found -- handles them in a new thread using the default display limit.
198 escaped = URI.escape(m.message, OUR_UNSAFE)
199 urls = URI.extract(escaped, ['http', 'https'])
200 return if urls.empty?
201 Thread.new { handle_urls(m, :urls => urls) }
# Sends one reply line per stored url: "[timestamp] <nick> url", optionally
# followed by " --> title".
#
# opts: reads :channel here; callers in this file also pass :msg, :list and
#       :max (presumably unpacked into m, list and max in lines not shown).
# Does nothing unless list, max and m are all set.
204 def reply_urls(opts={})
207 channel = opts[:channel]
209 return unless list and max and m
# NOTE(review): with max == 0 the range becomes 0..-1, i.e. the whole list --
# confirm that callers clamp max to at least 1.
210 list[0..(max-1)].each do |url|
211 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
# Link info is only fetched for listings when 'url.info_on_list' is set; any
# error raised by the lookup is swallowed by the rescue modifier.
212 if @bot.config['url.info_on_list']
214 get_title_for_url(url.url,
215 :nick => url.nick, :channel => channel) rescue nil
216 # If the url info was missing and we now have some, try to upgrade it
217 if channel and title and not url.info
218 ll = @registry[channel]
220 if el = ll.find { |u| u.url == url.url }
222 @registry[channel] = ll
225 disp << " --> #{title}" if title
227 m.reply disp, :overlong => :truncate
# Listing fragment (its def line is not visible here): picks the channel from
# params[:channel], falling back to the message target, reads the limit, and
# either reports that nothing is stored or delegates to reply_urls.
232 channel = params[:channel] ? params[:channel] : m.target
233 max = params[:limit].to_i
236 list = @registry[channel]
# Presumably only sent when the list is empty (the condition is not visible here).
238 m.reply "no urls seen yet for channel #{channel}"
240 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the stored urls of a channel with a case-insensitive regexp and
# replies with the matches through reply_urls.
#
# m:      the requesting message.
# params: :channel (optional, defaults to m.target), :limit, and :string
#         (the regexp source).
244 def search(m, params)
245 channel = params[:channel] ? params[:channel] : m.target
246 max = params[:limit].to_i
247 string = params[:string]
# NOTE(review): Regexp.new raises RegexpError for a malformed pattern; no
# rescue is visible here -- confirm how bad input is reported.
250 regex = Regexp.new(string, Regexp::IGNORECASE)
# An entry matches on its url or nick, and also on its stored info when
# 'url.info_on_list' is set.
251 list = @registry[channel].find_all {|url|
252 regex.match(url.url) || regex.match(url.nick) ||
253 (@bot.config['url.info_on_list'] && regex.match(url.info))
# Presumably only sent when nothing matched (the condition is not visible here).
256 m.reply "no matches for channel #{channel}"
258 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes.
263 plugin = UrlPlugin.new
# "urls info ..." and "url info ..." both dispatch to the 'info' action with
# every remaining word collected into :urls.
264 plugin.map 'urls info *urls', :action => 'info'
265 plugin.map 'url info *urls', :action => 'info'
# Search with an explicit channel; :limit must be all digits and defaults to 4.
266 plugin.map 'urls search :channel :limit :string', :action => 'search',
267 :defaults => {:limit => 4},
268 :requirements => {:limit => /^\d+$/},
# Search without a channel parameter.
270 plugin.map 'urls search :limit :string', :action => 'search',
271 :defaults => {:limit => 4},
272 :requirements => {:limit => /^\d+$/},
# Listing with an explicit channel; same :limit default and digit requirement.
274 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
275 :requirements => {:limit => /^\d+$/},
# Listing without a channel parameter.
277 plugin.map 'urls :limit', :defaults => {:limit => 4},
278 :requirements => {:limit => /^\d+$/},