# Record describing one stored link. The fields are filled in by handle_urls as
# (message target, sender nick, time seen, url string, link info/title).
# NOTE(review): define_structure is a project helper -- presumably Struct-like; confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Bot plugin that picks up http(s) links from messages, can report link info
# (title, content type, ...) for them, and keeps a per-channel list of the
# urls it has seen in the plugin registry.
8 class UrlPlugin < Plugin
# Prefix put in front of link-info replies.
9 LINK_INFO = "[Link Info]"
# Matches every character that is neither URI-unreserved nor URI-reserved nor
# one of '%', '#' and space; handed to URI.escape as the set of characters to
# escape before urls are extracted from a message.
# NOTE(review): the third argument ('N') is the legacy encoding flag of
# Regexp.new -- confirm the targeted Ruby versions still accept it.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Upper bound on the number of urls stored; must be positive.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# Maximum number of links per line for which info is shown automatically (0 disables).
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# When set, links without a title produce no info output.
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# When set, the first paragraph of the page is added to the link info.
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
# When set, listing and searching stored urls also shows their link info.
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# Host patterns for which no info is fetched. The defaults cover localhost and
# the IPv4 ranges 192.168.*, 10.*, 127.* and 172.16.* - 172.31.*; changing the
# list makes the 'url' plugin rebuild its host regexp via reset_no_info_hosts.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# NOTE(review): these statements presumably belong to the constructor; its
# def line is not among the visible lines.
# Use an empty Array as the registry default -- presumably what a channel with
# no stored urls reads back as; confirm against the registry implementation.
35 @registry.set_default(Array.new)
# If the configured url.display_link_info is not an Integer, feed its string
# form back through the config item's set_string -- presumably to convert a
# value saved under an older, non-integer form of the option; confirm.
36 unless @bot.config['url.display_link_info'].kind_of?(Integer)
37 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuilds @no_info_hosts, the case-insensitive regexp that get_title_for_url
# matches against a host's names and addresses, from the
# 'url.no_info_hosts' config list.
#
# Blank entries are skipped, and a list with nothing left yields a regexp that
# never matches. Joining an empty list (or a blank entry) would otherwise
# produce an empty pattern, which matches every host and would disable info
# retrieval everywhere.
def reset_no_info_hosts
  patterns = @bot.config['url.no_info_hosts'].reject { |pattern| pattern.to_s.empty? }
  @no_info_hosts =
    if patterns.empty?
      /(?!)/ # empty negative lookahead: can never match
    else
      Regexp.new(patterns.join('|'), Regexp::IGNORECASE)
    end
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Returns the usage text for the plugin's commands. Both arguments are
# accepted but unused: the same text is returned for every topic.
def help(plugin, topic="")
  commands = [
    "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url)",
    "urls [<max>=4] => list <max> last urls mentioned in current channel",
    "urls search [<max>=4] <regexp> => search for matching urls"
  ]
  private_note = "In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  "#{commands.join(', ')}. #{private_note}"
end
# Returns the title of an HTML page by delegating to pagedata's
# ircify_html_title (presumably a project-provided String extension; confirm).
def get_title_from_html(pagedata)
  pagedata.ircify_html_title
end
# Builds the link-info text for +uri_str+ (a URI, or a string parsed into one).
#
# Returns nil when the scheme does not match /https?/, an explanatory string
# when the host lookup fails or the host matches @no_info_hosts, and otherwise
# the gathered fields joined with ", " -- but only when a title was found or
# url.titles_only is off. Raises with a "connecting to site/processing
# information" message when fetching or processing fails. Also fires the
# :url_added event with the url and the collected log options.
# +opts+: callers pass :nick, :channel and :ircline; NOTE(review) how it is
# consumed is not visible on these lines -- presumably it seeds logopts; confirm.
55 def get_title_for_url(uri_str, opts = {})
57 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme that merely contains
# "http" passes this check.
58 return if url.scheme !~ /https?/
60 # also check the ip, the canonical name and the aliases
# Resolve the host so every name/address it is known by can be screened.
62 checks = TCPSocket.gethostbyname(url.host)
# Lookup failures are reported to the caller as text instead of being raised.
65 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse hosts for which any resolved name or address matches the no-info regexp.
71 unless checks.grep(@no_info_hosts).empty?
72 return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
81 debug "+ getting info for #{url.request_uri}"
82 info = Utils.get_html_info(url)
86 logopts[:title] = title = info[:title]
# Page content is always logged, but only shown when url.first_par is on.
89 logopts[:extra] = info[:content]
90 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# Response headers (content type, encoding, size) are always logged; they are
# shown only when there is no title (encoding/size also when url.first_par is on).
92 logopts[:extra] = String.new
93 logopts[:extra] << "Content Type: #{resp['content-type']}"
94 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
95 if enc = resp['content-encoding']
96 logopts[:extra] << ", encoding: #{enc}"
97 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Content length with comma thousands separators; any failure here leaves size nil.
100 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
102 logopts[:extra] << ", size: #{size} bytes"
103 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also catches non-StandardError exceptions
# (interrupts, exit requests); confirm that this breadth is intended.
106 rescue Exception => e
112 raise "connecting to site/processing information (#{e.message})"
# Let other plugins know about the url and what was learned about it.
116 call_event(:url_added, url.to_s, logopts)
# The title goes first in the output.
118 extra.unshift("#{Bold}title#{Bold}: #{title}")
# Without a title nothing is returned when url.titles_only is set.
120 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the urls found in message +m+: fetches link info for each http(s)
# url, replies with it, and records the url in the channel's stored list.
# +urls+ is the list of url strings; +display_info+ is the maximum number of
# link-info replies to send (defaults to the url.display_link_info setting).
123 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
124 return if urls.empty?
125 debug "found urls #{urls.inspect}"
# Only public messages have a stored url list; for anything else list is nil.
126 list = m.public? ? @registry[m.target] : nil
127 debug "display link info: #{display_info}"
129 urls.each do |urlstr|
130 debug "working on #{urlstr}"
131 next unless urlstr =~ /^https?:/
133 debug "Getting title for #{urlstr}..."
# The sender, channel and full line travel along as opts for the lookup.
136 title = get_title_for_url(urlstr,
137 :nick => m.source.nick,
138 :channel => m.channel,
139 :ircline => m.message)
140 debug "Title #{title ? '' : 'not '} found"
141 reply = "#{LINK_INFO} #{title}" if title
# On a 404 the last character is chopped off (in place) and the lookup is
# retried, for as long as something remains after the scheme.
143 if e.message =~ /\(404 - Not Found\)/i
144 # see if we failed to find the thing because of trailing punctuation
145 # but check that we still have 'something' in the URL
146 retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
148 reply = "Error #{e.message}"
# Reply only while fewer than display_info replies have been sent
# (urls_displayed is presumably the running count; its update is not visible here).
151 if display_info > urls_displayed
153 m.plainreply(reply, :overlong => :truncate)
160 # check to see if this url is already listed
161 next if list.find {|u| u.url == urlstr }
163 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
164 debug "#{list.length} urls so far"
# Drop the last list entry once the list is longer than url.max_urls.
165 list.pop if list.length > @bot.config['url.max_urls']
166 debug "storing url #{url.url}"
168 debug "#{list.length} urls now"
# Write the updated list back to the registry under the message target.
170 @registry[m.target] = list
# NOTE(review): presumably the body of the 'info' action mapped at the bottom of
# the file; its def line is not among the visible lines.
# Escape the characters matched by OUR_UNSAFE, then pull the urls out of the text.
# NOTE(review): URI.escape is no longer available in recent Ruby versions --
# confirm the Ruby version this plugin targets.
174 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
175 urls = URI.extract(escaped)
# Handle the urls in a separate thread, allowing as many link-info replies as
# there are entries in params[:urls].
176 Thread.new { handle_urls(m, urls, params[:urls].length) }
# NOTE(review): presumably the body of the message listener; its def line is not
# among the visible lines.
# Only PrivMessage instances are inspected.
180 return unless m.kind_of?(PrivMessage)
# Escape the characters matched by OUR_UNSAFE, then extract only http and https urls.
183 escaped = URI.escape(m.message, OUR_UNSAFE)
184 urls = URI.extract(escaped, ['http', 'https'])
185 return if urls.empty?
# Handle the urls in a separate thread, using the configured display limit.
186 Thread.new { handle_urls(m, urls) }
# Replies with one line per stored url, for at most +max+ entries of +list+.
# Callers in this file pass :msg, :channel, :list and :max in +opts+; m, list
# and max are presumably read from those keys (the assignments are not visible
# here -- confirm). Nothing is sent unless all three are present.
189 def reply_urls(opts={})
192 channel = opts[:channel]
194 return unless list and max and m
# Only the first max entries are shown.
195 list[0..(max-1)].each do |url|
196 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
197 if @bot.config['url.info_on_list']
# A failed lookup is swallowed (rescue nil) so the listing carries on.
199 get_title_for_url(url.url,
200 :nick => url.nick, :channel => channel) rescue nil
201 # If the url info was missing and we now have some, try to upgrade it
202 if channel and title and not url.info
203 ll = @registry[channel]
205 if el = ll.find { |u| u.url == url.url }
# Save the channel's list back to the registry.
207 @registry[channel] = ll
210 disp << " --> #{title}" if title
212 m.reply disp, :overlong => :truncate
# NOTE(review): presumably the body of the url-listing action; its def line and
# the conditions around the replies are not among the visible lines.
# Fall back to the message target when no channel was given.
217 channel = params[:channel] ? params[:channel] : m.target
218 max = params[:limit].to_i
221 list = @registry[channel]
# Sent when there is nothing to list (presumably when the list is empty; confirm).
223 m.reply "no urls seen yet for channel #{channel}"
225 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Lists the stored urls of a channel that match a regular expression.
# +params+ provides :channel (optional, defaults to the message target),
# :limit (maximum number of results) and :string (the pattern source).
229 def search(m, params)
230 channel = params[:channel] ? params[:channel] : m.target
231 max = params[:limit].to_i
232 string = params[:string]
# The pattern is matched case-insensitively.
235 regex = Regexp.new(string, Regexp::IGNORECASE)
# A stored url matches on its url or nick, and on its info too when
# url.info_on_list is enabled.
236 list = @registry[channel].find_all {|url|
237 regex.match(url.url) || regex.match(url.nick) ||
238 (@bot.config['url.info_on_list'] && regex.match(url.info))
# Sent when nothing matched (presumably when the filtered list is empty; confirm).
241 m.reply "no matches for channel #{channel}"
243 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes.
248 plugin = UrlPlugin.new
# 'urls info ...' and 'url info ...' both run the info action on the words that follow.
249 plugin.map 'urls info *urls', :action => 'info'
250 plugin.map 'url info *urls', :action => 'info'
# Search with an explicit channel; :limit defaults to 4 and must be all digits.
251 plugin.map 'urls search :channel :limit :string', :action => 'search',
252 :defaults => {:limit => 4},
253 :requirements => {:limit => /^\d+$/},
# Search without a channel; same :limit default and requirement.
255 plugin.map 'urls search :limit :string', :action => 'search',
256 :defaults => {:limit => 4},
257 :requirements => {:limit => /^\d+$/},
# List urls for an explicit channel; :limit defaults to 4 and must be all digits.
259 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
260 :requirements => {:limit => /^\d+$/},
# List urls without a channel; same :limit default and requirement.
262 plugin.map 'urls :limit', :defaults => {:limit => 4},
263 :requirements => {:limit => /^\d+$/},