# Record type for one remembered URL. Judging by the Url.new call in
# handle_urls, the fields are, in order: the channel the URL was seen in, the
# nick that posted it, the time it was seen, the URL string, and the link
# info gathered for it. define_structure is not defined in the visible code.
6 define_structure :Url, :channel, :nick, :time, :url, :info
8 class UrlPlugin < Plugin
# Prefix put in front of the title/info text when replying (see handle_urls).
9 LINK_INFO = "[Link Info]"
# Matches every character that is NOT a URI unreserved or reserved character,
# a '%', a '#' or a space. It is handed to URI.escape as the set of characters
# to escape before URLs are extracted from a message.
# NOTE(review): the third argument ('N') looks like the legacy Regexp.new
# kcode parameter of old Ruby versions -- confirm before running on a modern
# interpreter.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration keys of the plugin.
# NOTE(review): the embedded line numbers have gaps here, so some option
# lines (e.g. the :default of several keys) are not visible in this excerpt.

# Upper bound for the per-channel list of stored urls; must be positive.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# Maximum number of links per line for which info is shown automatically;
# 0 disables the feature (see :desc).
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# When set, get_title_for_url returns info only for links that have a title.
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# When set, the first paragraph of the page is added to the link info.
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
# When set, listing and searching also show (and search in) link info.
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# Host patterns (regular expression sources) for which no info is fetched.
# The defaults cover localhost and private/loopback IPv4 ranges. Changing the
# value rebuilds the combined regexp through reset_no_info_hosts.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# NOTE(review): the enclosing `def` line is not visible in this excerpt;
# presumably this is part of the plugin's initialize -- confirm.
# Registry entries that were never written default to an empty Array.
35 @registry.set_default(Array.new)
# If the stored 'url.display_link_info' value is not an Integer, re-set the
# config item from the value's string form.
# NOTE(review): presumably this converts a setting saved by an older version
# that used a non-integer type -- confirm.
36 unless @bot.config['url.display_link_info'].kind_of?(Integer)
37 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuilds @no_info_hosts from the 'url.no_info_hosts' configuration value.
# The configured patterns are joined with '|' into a single regexp; the
# second argument (true) makes it case-insensitive. Also invoked by the
# :on_change hook of the config key.
42 def reset_no_info_hosts
43 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
44 debug "no info hosts regexp set to #{@no_info_hosts}"
# Returns the usage text for the plugin's commands. Neither +plugin+ nor
# +topic+ is used in the visible lines; the same string is returned for every
# topic.
47 def help(plugin, topic="")
48 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns the title of the HTML in +pagedata+ by delegating to
# pagedata.ircify_html_title (defined outside the visible code; presumably a
# String extension that formats the <title> for IRC -- confirm).
51 def get_title_from_html(pagedata)
52 return pagedata.ircify_html_title
# Gathers a one-line description (title and/or type, encoding, size) for a
# link.
#
# uri_str:: a URI object or a String that is parsed with URI.parse
# opts::    options hash; how it is used is not visible in this excerpt
#           (callers pass :nick, :channel and :ircline)
#
# Returns a String, or nil when the scheme is not accepted. When the link has
# no title and 'url.titles_only' is set, the final `return` does not fire.
# Raises a RuntimeError with a "connecting to site/processing information"
# message when the retrieval fails.
#
# NOTE(review): the embedded line numbers have gaps, so several lines of this
# method (begin/rescue lines, the initialisation of logopts, extra, resp and
# title, some conditionals and all `end`s) are not visible in this excerpt.
55 def get_title_for_url(uri_str, opts = {})
57 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# Bail out with nil unless the scheme matches.
# NOTE(review): the pattern is unanchored, so any scheme merely containing
# "http" passes this check.
58 return if url.scheme !~ /https?/
60 # also check the ip, the canonical name and the aliases
62 checks = TCPSocket.gethostbyname(url.host)
# Returned (not raised) as the info text; presumably inside a rescue for the
# lookup above, which is where +e+ would come from -- confirm.
65 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse to fetch info if any lookup result matches the no-info host regexp.
71 unless checks.grep(@no_info_hosts).empty?
72 return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
81 debug "+ getting info for #{url.request_uri}"
# The :htmlinfo filter is defined elsewhere; the lines below read :title and
# :content from its result.
82 info = @bot.filter(:htmlinfo, url)
# logopts collects what is passed to the :url_added event; extra collects the
# pieces of the reply.
86 logopts[:title] = title = info[:title]
89 logopts[:extra] = info[:content]
90 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# The following lines describe the response headers instead (content type,
# encoding, size). The condition selecting this branch is not visible.
92 logopts[:extra] = String.new
93 logopts[:extra] << "Content Type: #{resp['content-type']}"
94 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
95 if enc = resp['content-encoding']
96 logopts[:extra] << ", encoding: #{enc}"
97 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Format the content length with thousands separators. Any error (e.g. a
# missing header) is swallowed by the inline rescue and leaves size nil.
100 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
102 logopts[:extra] << ", size: #{size} bytes"
103 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also catches non-StandardError exceptions
# such as Interrupt and SystemExit; consider StandardError.
106 rescue Exception => e
# Re-raised as a RuntimeError; handle_urls inspects e.message of what
# arrives here to decide whether to retry.
112 raise "connecting to site/processing information (#{e.message})"
# Notify other plugins about the url together with the gathered log options.
116 call_event(:url_added, url.to_s, logopts)
# The title goes first in the reply (the guarding condition, if any, is not
# visible).
118 extra.unshift("#{Bold}title#{Bold}: #{title}")
120 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the urls found in message +m+: optionally replies with link info
# and, for public messages, records new urls in the per-channel registry
# list.
#
# m::            the message the urls came from
# urls::         Array of url strings
# display_info:: info is displayed only while this is greater than
#                urls_displayed; defaults to 'url.display_link_info'
#
# NOTE(review): the embedded line numbers have gaps, so several lines of this
# method (begin/rescue lines, the urls_displayed counter, the insertion into
# the list and all `end`s) are not visible in this excerpt.
123 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
124 return if urls.empty?
125 debug "found urls #{urls.inspect}"
# Only public (channel) messages have a stored list; otherwise list is nil.
126 list = m.public? ? @registry[m.target] : nil
127 debug "display link info: #{display_info}"
129 urls.each do |urlstr|
130 debug "working on #{urlstr}"
# Ignore anything that is not an http/https url.
131 next unless urlstr =~ /^https?:/
133 debug "Getting title for #{urlstr}..."
136 title = get_title_for_url(urlstr,
137 :nick => m.source.nick,
138 :channel => m.channel,
139 :ircline => m.message)
140 debug "Title #{title ? '' : 'not '} found"
141 reply = "#{LINK_INFO} #{title}" if title
144 # we might get a 404 because of trailing punctuation, so we try again
145 # with the last character stripped. this might generate invalid URIs
146 # (e.g. because "some.url" gets chopped to some.url%2), so catch that too
147 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
148 # chop off last character, and retry if we still have enough string to
149 # look like a minimal URL
# NOTE(review): chop! mutates urlstr in place, so the shortened url is what
# gets stored below; the retry is bounded only by the length of the string.
150 retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
152 reply = "Error #{e.message}"
# Show the info only while the per-line limit has not been reached.
155 if display_info > urls_displayed
157 m.plainreply(reply, :overlong => :truncate)
164 # check to see if this url is already listed
165 next if list.find {|u| u.url == urlstr }
167 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
168 debug "#{list.length} urls so far"
# Drop the last entry once the list is over the configured maximum.
# NOTE(review): with `>` the list can reach max_urls + 1 entries after the
# insertion that presumably follows -- confirm whether `>=` was intended.
169 list.pop if list.length > @bot.config['url.max_urls']
170 debug "storing url #{url.url}"
172 debug "#{list.length} urls now"
# Write the updated list back to the registry.
174 @registry[m.target] = list
# NOTE(review): the enclosing `def` line is not visible in this excerpt;
# presumably this is the 'info' action mapped for 'url(s) info *urls' --
# confirm.
# Escape the unsafe characters of the given urls, extract the urls and handle
# them in a background thread. display_info is set to the number of url
# parameters given.
178 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
179 urls = URI.extract(escaped)
180 Thread.new { handle_urls(m, urls, params[:urls].length) }
# NOTE(review): the enclosing `def` line is not visible in this excerpt;
# presumably this is the hook called for channel messages -- confirm.
# Escape the unsafe characters of the message text, extract the http/https
# urls from it and, if there are any, handle them in a background thread with
# the configured display limit.
186 escaped = URI.escape(m.message, OUR_UNSAFE)
187 urls = URI.extract(escaped, ['http', 'https'])
188 return if urls.empty?
189 Thread.new { handle_urls(m, urls) }
# Replies to a message with one line per url of a list.
#
# opts:: options hash. :channel is read below; callers also pass :msg, :list
#        and :max, which presumably feed the locals m, list and max (those
#        assignments are not visible in this excerpt).
#
# Does nothing unless list, max and m are all set.
#
# NOTE(review): the embedded line numbers have gaps, so several lines of this
# method (including the assignment of +title+ and the update of the stored
# entry) are not visible in this excerpt.
192 def reply_urls(opts={})
195 channel = opts[:channel]
197 return unless list and max and m
# Only the first +max+ entries are shown.
198 list[0..(max-1)].each do |url|
199 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
200 if @bot.config['url.info_on_list']
# Any error while fetching the info is swallowed by the inline rescue.
202 get_title_for_url(url.url,
203 :nick => url.nick, :channel => channel) rescue nil
204 # If the url info was missing and we now have some, try to upgrade it
205 if channel and title and not url.info
206 ll = @registry[channel]
208 if el = ll.find { |u| u.url == url.url }
# Store the list back in the registry after the entry was updated.
210 @registry[channel] = ll
213 disp << " --> #{title}" if title
215 m.reply disp, :overlong => :truncate
# NOTE(review): the enclosing `def` line is not visible in this excerpt;
# presumably this is the action behind the 'urls [:channel] :limit' maps --
# confirm.
# Use the explicitly given channel, falling back to the message target.
220 channel = params[:channel] ? params[:channel] : m.target
221 max = params[:limit].to_i
224 list = @registry[channel]
# Reply shown when nothing is stored for the channel (the condition is not
# visible in this excerpt).
226 m.reply "no urls seen yet for channel #{channel}"
228 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the urls stored for a channel and replies with the matches.
#
# m::      the message that triggered the search
# params:: :channel (optional, defaults to the message target), :limit
#          (converted with to_i) and :string (the regexp source)
#
# An entry matches when the case-insensitive regexp matches its url or its
# nick, or, with 'url.info_on_list' enabled, its stored info.
#
# NOTE(review): Regexp.new is called on user-supplied text and raises
# RegexpError for an invalid pattern; no handling is visible in this excerpt
# (some lines of the method are missing).
232 def search(m, params)
233 channel = params[:channel] ? params[:channel] : m.target
234 max = params[:limit].to_i
235 string = params[:string]
238 regex = Regexp.new(string, Regexp::IGNORECASE)
239 list = @registry[channel].find_all {|url|
240 regex.match(url.url) || regex.match(url.nick) ||
241 (@bot.config['url.info_on_list'] && regex.match(url.info))
# Reply shown when nothing matched (the condition is not visible in this
# excerpt).
244 m.reply "no matches for channel #{channel}"
246 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes.
# NOTE(review): most map calls below end in a trailing comma, and the
# embedded line numbers have gaps, so their final option lines are not
# visible in this excerpt.
251 plugin = UrlPlugin.new
# 'urls info ...' and 'url info ...' both dispatch to the 'info' action with
# all remaining words collected in :urls.
252 plugin.map 'urls info *urls', :action => 'info'
253 plugin.map 'url info *urls', :action => 'info'
# Search routes, with and without an explicit channel. :limit defaults to 4
# and must consist of digits only.
254 plugin.map 'urls search :channel :limit :string', :action => 'search',
255 :defaults => {:limit => 4},
256 :requirements => {:limit => /^\d+$/},
258 plugin.map 'urls search :limit :string', :action => 'search',
259 :defaults => {:limit => 4},
260 :requirements => {:limit => /^\d+$/},
# Listing routes, with and without an explicit channel; same :limit rules.
262 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
263 :requirements => {:limit => /^\d+$/},
265 plugin.map 'urls :limit', :defaults => {:limit => 4},
266 :requirements => {:limit => /^\d+$/},