# Declares the Url record with the fields channel, nick, time, url and info.
# NOTE(review): define_structure is defined outside this listing — presumably a Struct-like helper; confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# URL plugin class; subclasses Plugin.
8 class UrlPlugin < Plugin
# Prefix placed in front of link-info replies sent by handle_urls.
9 LINK_INFO = "[Link Info]"
# Regexp handed to URI.escape as the set of characters to escape: anything that
# is not a URI unreserved/reserved character, '%', '#' or a space.
# NOTE(review): the third argument 'N' is presumably the Ruby 1.8 "no encoding" flag — confirm on the target Ruby version.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration keys of the plugin.
# NOTE(review): several option lines (e.g. the :default of most keys and the
# end of the last registration) are not visible in this listing.

# url.max_urls: integer, default 100, validated to be > 0.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# url.display_link_info: integer; per its description, the maximum number of
# links per line for which info is shown (0 disables).
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# url.titles_only: boolean; consulted by get_title_for_url when deciding whether to return info without a title.
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# url.first_par: boolean; enables the extra "text"/"encoding"/"size" pieces in get_title_for_url.
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
# url.info_on_list: boolean; used by reply_urls and search.
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# url.no_info_hosts: array of regexp sources. The default covers localhost and
# the 192.168.*, 10.*, 127.* and 172.16-31.* address prefixes. Changing the
# value calls reset_no_info_hosts on the 'url' plugin.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# url.only_on_channels: array; when non-empty, handle_urls returns early for channels not in it.
31 Config.register Config::ArrayValue.new('url.only_on_channels',
32 :desc => "Show link info only on these channels",
# NOTE(review): the enclosing def line is not visible in this listing; these
# statements presumably belong to the constructor — confirm.
# Registry lookups fall back to an empty Array.
37 @registry.set_default(Array.new)
# When the configured url.display_link_info is not an Integer, feed its string
# form back through set_string.
# NOTE(review): presumably converts a value saved by an older, non-integer version of the option — confirm.
38 unless @bot.config['url.display_link_info'].kind_of?(Integer)
39 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuilds @no_info_hosts: all url.no_info_hosts entries are joined with '|'
# into a single case-insensitive regexp (second argument true).
# Invoked by the url.no_info_hosts :on_change hook.
# NOTE(review): with an empty list the joined pattern is the empty string,
# which matches every host — confirm this is acceptable.
44 def reset_no_info_hosts
45 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
46 debug "no info hosts regexp set to #{@no_info_hosts}"
# Returns the usage text describing the "url info", "urls" and "urls search" commands.
# plugin, topic:: not used by the visible body; the same string is returned regardless.
49 def help(plugin, topic="")
50 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns pagedata.ircify_html_title.
# NOTE(review): ircify_html_title is defined outside this listing — presumably a String extension; confirm.
53 def get_title_from_html(pagedata)
54 return pagedata.ircify_html_title
# Builds a one-line description (title, optionally text/type/encoding/size) for a URL.
#
# uri_str:: a URI object or a string that is parsed with URI.parse
# opts::    options hash; its use is not visible in this listing
#           (logopts is presumably derived from it — confirm)
#
# Returns nil for URLs whose scheme does not match /https?/, a message string
# when the host cannot be resolved or matches @no_info_hosts, and otherwise the
# collected pieces joined with ", " when a title was found or url.titles_only
# is off. Raises a RuntimeError ("connecting to site/processing information
# (...)") from the rescue handler below.
#
# NOTE(review): many lines of this method (begin/rescue/end, initialisation of
# extra, logopts and resp) are missing from this listing.
57 def get_title_for_url(uri_str, opts = {})
59 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme merely containing "http" passes — confirm intent.
60 return if url.scheme !~ /https?/
62 # also check the ip, the canonical name and the aliases
# gethostbyname returns an array whose first element is used below as the resolved name.
64 checks = TCPSocket.gethostbyname(url.host)
67 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse to fetch when any lookup result matches the no-info regexp.
73 unless checks.grep(@no_info_hosts).empty?
74 return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
83 debug "+ getting info for #{url.request_uri}"
# The :htmlinfo filter yields a hash; :title and :content are read from it below.
84 info = @bot.filter(:htmlinfo, url)
88 logopts[:title] = title = info[:title]
91 logopts[:extra] = info[:content]
92 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# Alternative path: describe the response headers (content type, encoding, size).
# NOTE(review): the condition selecting this path is not visible here.
94 logopts[:extra] = String.new
95 logopts[:extra] << "Content Type: #{resp['content-type']}"
96 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
97 if enc = resp['content-encoding']
98 logopts[:extra] << ", encoding: #{enc}"
99 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Insert thousands separators into the content length; any error leaves size nil.
102 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
104 logopts[:extra] << ", size: #{size} bytes"
105 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also catches non-StandardError exceptions (e.g. Interrupt) — confirm this is intended.
108 rescue Exception => e
114 raise "connecting to site/processing information (#{e.message})"
# Notify listeners that a URL was processed, passing the collected log options.
118 call_event(:url_added, url.to_s, logopts)
# The title goes first in the reply.
120 extra.unshift("#{Bold}title#{Bold}: #{title}")
122 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the URLs found in a message: looks up link info, replies with it
# while the display limit allows, and records new URLs in the registry entry
# for m.target.
#
# m::            the message the URLs came from
# urls::         array of URL strings
# display_info:: maximum number of link infos to display; defaults to the
#                url.display_link_info setting
#
# NOTE(review): the begin/rescue/end structure, the urls_displayed counter and
# the statement that stores the new url are missing from this listing.
125 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
# When url.only_on_channels is non-empty, ignore messages from other channels (case-insensitive).
# NOTE(review): m.channel is presumably nil for private messages, in which case .downcase would raise — confirm.
126 unless (channels = @bot.config['url.only_on_channels']).empty?
127 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
130 return if urls.empty?
131 debug "found urls #{urls.inspect}"
# list stays nil for non-public messages.
132 list = m.public? ? @registry[m.target] : nil
133 debug "display link info: #{display_info}"
135 urls.each do |urlstr|
136 debug "working on #{urlstr}"
# Skip anything that is not http:// or https:// followed by at least one character.
137 next unless urlstr =~ /^https?:\/\/./
139 debug "Getting title for #{urlstr}..."
142 title = get_title_for_url(urlstr,
143 :nick => m.source.nick,
144 :channel => m.channel,
145 :ircline => m.message)
146 debug "Title #{title ? '' : 'not '} found"
147 reply = "#{LINK_INFO} #{title}" if title
150 # we might get a 404 because of trailing punctuation, so we try again
151 # with the last character stripped. this might generate invalid URIs
152 # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
153 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
154 # chop off last character, and retry if we still have enough string to
155 # look like a minimal URL
# chop! shortens urlstr in place, so each retry works on a shorter string.
156 retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
158 reply = "Error #{e.message}"
# Only reply while fewer than display_info infos have been shown.
161 if display_info > urls_displayed
163 m.plainreply(reply, :overlong => :truncate)
170 # check to see if this url is already listed
171 next if list.find {|u| u.url == urlstr }
173 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
174 debug "#{list.length} urls so far"
# NOTE(review): the last entry is dropped only when the list is already longer
# than url.max_urls; if the new url is stored afterwards (line not visible
# here), the list can presumably hold max_urls + 1 entries — confirm whether >=
# was intended.
175 list.pop if list.length > @bot.config['url.max_urls']
176 debug "storing url #{url.url}"
178 debug "#{list.length} urls now"
# Write the updated list back to the registry.
180 @registry[m.target] = list
# NOTE(review): the enclosing def line is not visible in this listing; these
# statements presumably form the 'info' action mapped by the 'url(s) info *urls' templates — confirm.
# Escape the characters matched by OUR_UNSAFE, then extract the URLs from the result.
# NOTE(review): params[:urls] is presumably an Array; Array#to_s yields an
# inspect-style string on Ruby >= 1.9 — confirm. URI.escape was also removed in Ruby 3.0.
184 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
185 urls = URI.extract(escaped)
# Handle the URLs in a separate thread; the display limit is the number of
# items in params[:urls] rather than the configured default.
186 Thread.new { handle_urls(m, urls, params[:urls].length) }
# NOTE(review): the enclosing def line is not visible in this listing; these
# statements presumably run for each incoming message m — confirm.
# Escape the characters matched by OUR_UNSAFE, then extract only http/https URLs.
# NOTE(review): URI.escape was removed in Ruby 3.0 — confirm the target Ruby version.
192 escaped = URI.escape(m.message, OUR_UNSAFE)
193 urls = URI.extract(escaped, ['http', 'https'])
194 return if urls.empty?
# Handle the URLs in a separate thread with the default display limit.
195 Thread.new { handle_urls(m, urls) }
# Replies with up to max entries of a url list, one reply per entry.
#
# opts:: hash; callers in this file pass :msg, :channel, :list and :max.
#        Only the :channel lookup is visible here — list, max and m are
#        presumably read from opts on lines missing from this listing.
#
# Returns early (nil) unless list, max and m are all set.
198 def reply_urls(opts={})
201 channel = opts[:channel]
203 return unless list and max and m
# NOTE(review): when max is 0 the range becomes 0..-1, which selects the whole
# list; callers are presumably expected to pass max >= 1 — confirm.
204 list[0..(max-1)].each do |url|
205 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
206 if @bot.config['url.info_on_list']
# Any error raised while fetching the info is swallowed by the rescue modifier.
# NOTE(review): the assignment target (presumably title) is on a line not visible here.
208 get_title_for_url(url.url,
209 :nick => url.nick, :channel => channel) rescue nil
210 # If the url info was missing and we now have some, try to upgrade it
211 if channel and title and not url.info
212 ll = @registry[channel]
214 if el = ll.find { |u| u.url == url.url }
216 @registry[channel] = ll
219 disp << " --> #{title}" if title
221 m.reply disp, :overlong => :truncate
# NOTE(review): the enclosing def line is not visible in this listing; these
# statements presumably implement the 'urls [:channel] :limit' command — confirm.
# Fall back to the message target when no channel parameter was given.
226 channel = params[:channel] ? params[:channel] : m.target
227 max = params[:limit].to_i
230 list = @registry[channel]
# NOTE(review): the condition guarding this reply is not visible here.
232 m.reply "no urls seen yet for channel #{channel}"
234 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the stored urls of a channel with a case-insensitive regexp and
# replies with the matches via reply_urls.
#
# m::      the requesting message
# params:: hash with :string (regexp source), :limit and optional :channel
#          (defaults to m.target)
#
# An entry matches when its url or nick matches, or — when url.info_on_list is
# set — when its info matches.
# NOTE(review): string comes from the user; Regexp.new raises RegexpError on an
# invalid pattern and no handling is visible in this listing — confirm.
238 def search(m, params)
239 channel = params[:channel] ? params[:channel] : m.target
240 max = params[:limit].to_i
241 string = params[:string]
244 regex = Regexp.new(string, Regexp::IGNORECASE)
245 list = @registry[channel].find_all {|url|
246 regex.match(url.url) || regex.match(url.nick) ||
247 (@bot.config['url.info_on_list'] && regex.match(url.info))
# NOTE(review): the condition guarding this reply is not visible here.
250 m.reply "no matches for channel #{channel}"
252 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command templates.
# NOTE(review): the lines that end in a comma continue on lines missing from this listing.
257 plugin = UrlPlugin.new
# Both spellings of the info command dispatch to the 'info' action.
258 plugin.map 'urls info *urls', :action => 'info'
259 plugin.map 'url info *urls', :action => 'info'
# Search with and without an explicit channel; :limit defaults to 4 and must be all digits.
260 plugin.map 'urls search :channel :limit :string', :action => 'search',
261 :defaults => {:limit => 4},
262 :requirements => {:limit => /^\d+$/},
264 plugin.map 'urls search :limit :string', :action => 'search',
265 :defaults => {:limit => 4},
266 :requirements => {:limit => /^\d+$/},
# Listing with and without an explicit channel; no :action is given on the visible lines.
# NOTE(review): presumably dispatches to a method named after the first word ('urls') — confirm.
268 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
269 :requirements => {:limit => /^\d+$/},
271 plugin.map 'urls :limit', :defaults => {:limit => 4},
272 :requirements => {:limit => /^\d+$/},