# Record type for one remembered link, with the fields channel, nick, time,
# url and info (in that order).
# NOTE(review): define_structure is defined outside this excerpt; presumably
# it creates a Struct-like class named Url — confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin that reacts to links in messages: it can display information about
# them and keeps a per-channel list of recently seen urls.
# NOTE(review): several continuation lines of the Config.register calls below
# (e.g. the :default values of some options) are not visible in this excerpt.
8 class UrlPlugin < Plugin
# Prefix put in front of the link information sent back as a reply.
9 LINK_INFO = "[Link Info]"
# Matches every character that is NOT an URI unreserved character, an URI
# reserved character, '%', '#' or a space; it is passed to URI.escape as the
# set of characters to escape.
# NOTE(review): the third argument ('N') is the legacy encoding flag of
# Regexp.new — confirm it is still accepted by the Ruby version in use.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# url.max_urls: size limit of the stored url list; must be positive.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# url.display_link_info: how many links per line get their info displayed
# (0 disables the feature, per the description).
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# url.titles_only: restrict the displayed info to links that have a title.
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# url.first_par: also report the first paragraph of the page.
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
# url.info_on_list: include link info in the output of list/search commands.
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# url.no_info_hosts: regexp sources for hosts that must never be queried.
# The defaults cover localhost and the private/loopback IPv4 ranges; changing
# the option rebuilds the compiled regexp through reset_no_info_hosts.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# NOTE(review): the `def` line enclosing these statements is not visible in
# this excerpt; they use @registry and @bot, so presumably they belong to the
# plugin's instance setup — confirm.
# Registry lookups fall back to an Array.
# NOTE(review): exact semantics of set_default are defined elsewhere.
35 @registry.set_default(Array.new)
# When the configured url.display_link_info is not an Integer, re-assign it
# from its string form (presumably converting a value stored by an older
# version of the option — confirm).
36 unless @bot.config['url.display_link_info'].kind_of?(Integer)
37 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuild @no_info_hosts: a single case-insensitive regexp made by joining
# all url.no_info_hosts entries with '|'.
# NOTE(review): with an empty list the joined source is "", which yields a
# regexp that matches every host — confirm this is acceptable.
42 def reset_no_info_hosts
43 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
44 debug "no info hosts regexp set to #{@no_info_hosts}"
# Return the usage text of the plugin commands (url info, urls, urls search).
# Neither plugin nor topic is used in the visible body: the same string is
# returned regardless of their values.
47 def help(plugin, topic="")
48 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Return the result of calling ircify_html_title on pagedata.
# NOTE(review): ircify_html_title is defined outside this excerpt; presumably
# it extracts the <title> of an HTML string in IRC-friendly form — confirm.
51 def get_title_from_html(pagedata)
52 return pagedata.ircify_html_title
# Build a one-line description of a link: its title and, depending on the
# configuration, first paragraph, content type, encoding and size.
#
# uri_str - a String or an URI object.
# opts    - optional Hash. NOTE(review): the lines consuming it are not
#           visible in this excerpt (presumably it seeds logopts) — confirm.
#
# Returns the description String, a String explaining why info could not or
# must not be retrieved, or nil when the scheme check fails. Raises a
# RuntimeError when connecting to the site or processing its data fails.
# NOTE(review): many lines of this method (begin/rescue/if/else/end, and the
# assignments of extra, logopts and resp) are not visible here.
55 def get_title_for_url(uri_str, opts = {})
57 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme merely containing
# "http" passes this check.
58 return if url.scheme !~ /https?/
60 # also check the ip, the canonical name and the aliases
62 checks = TCPSocket.gethostbyname(url.host)
# Name resolution failure is reported to the caller as a plain string.
65 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse to query hosts where any resolved entry matches @no_info_hosts.
71 unless checks.grep(@no_info_hosts).empty?
72 return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
81 debug "+ getting info for #{url.request_uri}"
# The :htmlinfo filter result is read below through the :title and :content
# keys.
82 info = @bot.filter(:htmlinfo, url)
86 logopts[:title] = title = info[:title]
# The page content is always logged, but only shown when url.first_par is set.
89 logopts[:extra] = info[:content]
90 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# Header-based description (content type, encoding, size).
# NOTE(review): presumably the branch taken when no page content is
# available; the enclosing condition is not visible — confirm.
92 logopts[:extra] = String.new
93 logopts[:extra] << "Content Type: #{resp['content-type']}"
94 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
95 if enc = resp['content-encoding']
96 logopts[:extra] << ", encoding: #{enc}"
97 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Insert thousands separators into the content length; any failure (e.g. a
# missing header) leaves size nil because of the rescue modifier.
100 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
102 logopts[:extra] << ", size: #{size} bytes"
103 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also traps non-StandardError exceptions
# (signals, exit requests); consider narrowing.
106 rescue Exception => e
112 raise "connecting to site/processing information (#{e.message})"
# Notify listeners of the :url_added event with the collected log options.
116 call_event(:url_added, url.to_s, logopts)
# The title goes first in the output.
118 extra.unshift("#{Bold}title#{Bold}: #{title}")
# With url.titles_only set, untitled links produce no description here.
120 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Process the urls found in message m: look up link info, reply with it for
# a limited number of links, and remember each new url in the list stored in
# @registry under m.target.
#
# m            - the message the urls came from.
# urls         - Array of url Strings; nothing happens when it is empty.
# display_info - upper bound compared against urls_displayed before replying;
#                defaults to the url.display_link_info setting.
# NOTE(review): several lines of this method (begin/rescue/end, the
# urls_displayed bookkeeping, the list insertion) are not visible here.
123 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
124 return if urls.empty?
125 debug "found urls #{urls.inspect}"
# Only public messages have a stored list; for other messages list is nil.
126 list = m.public? ? @registry[m.target] : nil
127 debug "display link info: #{display_info}"
129 urls.each do |urlstr|
130 debug "working on #{urlstr}"
131 next unless urlstr =~ /^https?:/
133 debug "Getting title for #{urlstr}..."
136 title = get_title_for_url(urlstr,
137 :nick => m.source.nick,
138 :channel => m.channel,
139 :ircline => m.message)
140 debug "Title #{title ? '' : 'not '} found"
141 reply = "#{LINK_INFO} #{title}" if title
144 # we might get a 404 because of trailing punctuation, so we try again
145 # with the last character stripped. this might generate invalid URIs
146 # (e.g. because "some.url" gets chopped to some.url%2), so catch that too
147 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
148 # chop off last character, and retry if we still have enough string to
149 # look like a minimal URL
# chop! shortens urlstr in place, so later uses of urlstr in this iteration
# (duplicate check, stored Url) see the shortened string.
150 retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
# Any other failure is reported to the user as the reply text.
152 reply = "Error #{e.message}"
155 if display_info > urls_displayed
157 m.plainreply(reply, :overlong => :truncate)
164 # check to see if this url is already listed
165 next if list.find {|u| u.url == urlstr }
167 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
168 debug "#{list.length} urls so far"
# Drop the last entry when the list is over url.max_urls.
# NOTE(review): the comparison is strict and presumably runs before the new
# url is inserted, so the list may hold max_urls + 1 entries — confirm.
169 list.pop if list.length > @bot.config['url.max_urls']
170 debug "storing url #{url.url}"
172 debug "#{list.length} urls now"
# Write the list back to the registry under the message target.
174 @registry[m.target] = list
# NOTE(review): the `def` line enclosing these statements is not visible in
# this excerpt; presumably this is the 'info' action used by the
# 'url info *urls' / 'urls info *urls' maps — confirm.
# Escape characters matched by OUR_UNSAFE, extract the urls from the result
# and process them in a new thread. display_info is set to
# params[:urls].length (presumably the number of words given, so that info is
# shown for every requested url — confirm).
178 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
179 urls = URI.extract(escaped)
180 Thread.new { handle_urls(m, urls, params[:urls].length) }
# NOTE(review): the `def` line enclosing these statements is not visible in
# this excerpt; presumably this is the handler invoked for every incoming
# message — confirm.
# Ignore anything that is not a PrivMessage.
184 return unless m.kind_of?(PrivMessage)
# Escape characters matched by OUR_UNSAFE, then extract http/https urls only.
187 escaped = URI.escape(m.message, OUR_UNSAFE)
188 urls = URI.extract(escaped, ['http', 'https'])
189 return if urls.empty?
# Process the urls in a separate thread, using the default display_info.
190 Thread.new { handle_urls(m, urls) }
# Send one reply per stored url, for at most max entries of list.
#
# opts - Hash of options; :channel is read in the visible lines, and callers
#        in this file also pass :msg, :list and :max.
# NOTE(review): the assignments of m, list, max and title are not visible in
# this excerpt (presumably taken from opts[:msg], opts[:list], opts[:max] and
# url.info respectively) — confirm.
193 def reply_urls(opts={})
196 channel = opts[:channel]
# Nothing to do when the list, the limit or the message is missing.
198 return unless list and max and m
199 list[0..(max-1)].each do |url|
200 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
201 if @bot.config['url.info_on_list']
# The rescue modifier turns any lookup error into nil.
203 get_title_for_url(url.url,
204 :nick => url.nick, :channel => channel) rescue nil
205 # If the url info was missing and we now have some, try to upgrade it
206 if channel and title and not url.info
207 ll = @registry[channel]
209 if el = ll.find { |u| u.url == url.url }
# Write the list for this channel back to the registry.
211 @registry[channel] = ll
214 disp << " --> #{title}" if title
216 m.reply disp, :overlong => :truncate
# NOTE(review): the `def` line enclosing these statements is not visible in
# this excerpt; presumably this is the action behind the 'urls ...' maps —
# confirm.
# Use the requested channel, falling back to the message target.
221 channel = params[:channel] ? params[:channel] : m.target
222 max = params[:limit].to_i
225 list = @registry[channel]
# NOTE(review): the condition selecting between the two replies below is not
# visible (presumably whether the list is empty) — confirm.
227 m.reply "no urls seen yet for channel #{channel}"
229 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Reply with the stored urls of a channel that match a regular expression.
#
# m      - the requesting message; m.target is the default channel.
# params - reads :channel (optional), :limit and :string (regexp source).
# NOTE(review): some lines of this method (the closing of the find_all block
# and the condition around the replies) are not visible in this excerpt.
233 def search(m, params)
234 channel = params[:channel] ? params[:channel] : m.target
235 max = params[:limit].to_i
236 string = params[:string]
# The user-supplied string is compiled as a case-insensitive regexp.
# NOTE(review): an invalid pattern makes Regexp.new raise RegexpError; no
# handling is visible here — confirm it is rescued by the caller.
239 regex = Regexp.new(string, Regexp::IGNORECASE)
# A stored entry matches on its url or nick, and on its info only when
# url.info_on_list is enabled.
240 list = @registry[channel].find_all {|url|
241 regex.match(url.url) || regex.match(url.nick) ||
242 (@bot.config['url.info_on_list'] && regex.match(url.info))
245 m.reply "no matches for channel #{channel}"
247 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes.
# NOTE(review): the trailing option line of several map calls below is not
# visible in this excerpt.
252 plugin = UrlPlugin.new
# 'url info' and 'urls info' both dispatch to the 'info' action; *urls
# captures the rest of the command line.
253 plugin.map 'urls info *urls', :action => 'info'
254 plugin.map 'url info *urls', :action => 'info'
# Search routes, with and without an explicit channel. :limit defaults to 4
# and must consist of digits only.
255 plugin.map 'urls search :channel :limit :string', :action => 'search',
256 :defaults => {:limit => 4},
257 :requirements => {:limit => /^\d+$/},
259 plugin.map 'urls search :limit :string', :action => 'search',
260 :defaults => {:limit => 4},
261 :requirements => {:limit => /^\d+$/},
# Listing routes, with and without an explicit channel; same :limit rules.
263 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
264 :requirements => {:limit => /^\d+$/},
266 plugin.map 'urls :limit', :defaults => {:limit => 4},
267 :requirements => {:limit => /^\d+$/},