# Record type for a remembered URL. Elsewhere in this file it is built as
# Url.new(target, nick, time, url, info) and read through the accessors
# url/nick/time/info.
# NOTE(review): define_structure is defined outside this file; presumably it
# creates a Struct-like class named Url — confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin that extracts URLs from messages, can reply with information about
# them, and keeps a per-target list that can be listed and searched.
8 class UrlPlugin < Plugin
# Prefix placed in front of link-info replies (see handle_urls).
9 LINK_INFO = "[Link Info]"
# Matches every character that is not URI-unreserved, URI-reserved, '%', '#'
# or a space. It is passed as the "unsafe" set to URI.escape wherever this
# plugin escapes message text before extracting URLs.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration keys registered by this plugin. The :desc string of each key
# is its description; some argument lines of these calls are not visible in
# this listing.

# Upper bound for the number of stored urls; validated to be positive.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# Maximum number of links per line for which info is shown automatically.
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# Entries are regular-expression sources. The defaults cover localhost and
# the 192.168.*, 10.*, 127.* and 172.16-31.* address prefixes. Changing the
# value makes the plugin rebuild its cached regexp via reset_no_info_hosts.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
31 Config.register Config::ArrayValue.new('url.only_on_channels',
32 :desc => "Show link info only on these channels",
34 Config.register Config::ArrayValue.new('url.ignore',
35 :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
# NOTE(review): the enclosing method header is not visible in this listing;
# these statements look like plugin set-up code — confirm.
# Give the registry an Array as its default value (exact semantics of
# set_default are defined outside this file — confirm).
40 @registry.set_default(Array.new)
# If the configured url.display_link_info value is not an Integer, re-apply
# it through set_string using its string form.
41 unless @bot.config['url.display_link_info'].kind_of?(Integer)
42 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Select the :htmlinfo filter group; get_title_for_url queries the same
# group through @bot.filter.
45 self.filter_group = :htmlinfo
# Rebuilds @no_info_hosts, the case-insensitive regexp used to decide which
# hosts never get link info, from the 'url.no_info_hosts' config array.
#
# Every configured entry is a regular-expression source; the entries are
# OR-ed together. An empty list produces a regexp that matches nothing:
# joining zero entries would give the empty pattern, which matches every
# host and would silently disable link info for all URLs.
def reset_no_info_hosts
  sources = @bot.config['url.no_info_hosts']
  @no_info_hosts =
    if sources.empty?
      # Regexp.union without arguments is /(?!)/, which can never match.
      Regexp.union
    else
      Regexp.new(sources.join('|'), Regexp::IGNORECASE)
    end
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Returns the usage text for this plugin's commands. Both arguments are
# accepted but not consulted: the same text is returned for every topic.
def help(plugin, topic="")
  [
    "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url)",
    "urls [<max>=4] => list <max> last urls mentioned in current channel",
    "urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
  ].join(", ")
end
# Returns the title of +pagedata+ by delegating to its ircify_html_title
# method (an extension defined outside this file).
def get_title_from_html(pagedata)
  pagedata.ircify_html_title
end
# Builds the link-info text for a URL.
#
# uri_str:: a URI object, or a string that URI.parse accepts
# opts::    options hash; the visible code reads only :always_reply
#
# Outcomes visible in this listing (several lines of the method, including
# its begin/rescue/end structure, are not shown):
# * returns nil when the scheme does not match /https?/
# * returns an "Unable to retrieve info ..." string; that line uses `e`, so it
#   presumably sits in a rescue around the host lookup — confirm
# * returns a "Sorry, ..." string (with :always_reply) or false when the host
#   is matched by @no_info_hosts
# * raises a RuntimeError describing the failure when fetching or processing
#   the page raises
# * otherwise fires the :url_added event and returns the collected pieces
#   joined with ", " when a title was found or url.titles_only is off
62 def get_title_for_url(uri_str, opts = {})
64 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme that merely contains
# "http" passes this check.
65 return if url.scheme !~ /https?/
67 # also check the ip, the canonical name and the aliases
69 checks = TCPSocket.gethostbyname(url.host)
72 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse to fetch info when any of the looked-up names/addresses matches the
# configured no-info pattern.
78 unless checks.grep(@no_info_hosts).empty?
79 return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
88 debug "+ getting info for #{url.request_uri}"
# Ask the :htmlinfo filter group for information about the URL.
89 info = @bot.filter(:htmlinfo, url)
# logopts is the data handed to the :url_added event below; extra collects
# the pieces of the returned text. Their initialisation, and the origin of
# resp, are not visible in this listing.
93 logopts[:title] = title = info[:title]
96 logopts[:extra] = info[:content]
97 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
99 logopts[:extra] = String.new
100 logopts[:extra] << "Content Type: #{resp['content-type']}"
101 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
102 if enc = resp['content-encoding']
103 logopts[:extra] << ", encoding: #{enc}"
104 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Insert thousands separators into the first content-length value; any error
# while doing so leaves size nil.
107 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
109 logopts[:extra] << ", size: #{size} bytes"
110 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also catches non-StandardError exceptions.
113 rescue Exception => e
# Re-raise as a RuntimeError whose message embeds the original message;
# handle_urls prefixes it with "Error ".
119 raise "connecting to site/processing information (#{e.message})"
# Notify listeners that a URL was seen, passing the collected log data.
123 call_event(:url_added, url.to_s, logopts)
125 extra.unshift("#{Bold}title#{Bold}: #{title}")
# With url.titles_only enabled, text is only returned when a title was found.
127 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the URLs found in message +m+: optionally replies with link info
# for them and keeps the list stored under the message target in the
# registry up to date.
#
# The configuration values read below (:display_info, :channels, :ignore)
# appear to be defaults for the options used by this method; how +params+ is
# combined with them, and where the local +urls+ is assigned, is not visible
# in this listing — confirm.
130 def handle_urls(m, params={})
132 :display_info => @bot.config['url.display_link_info'],
133 :channels => @bot.config['url.only_on_channels'],
134 :ignore => @bot.config['url.ignore']
137 display_info= opts[:display_info]
138 channels = opts[:channels]
139 ignore = opts[:ignore]
# When a channel list is configured, act only on messages from one of those
# channels (compared case-insensitively).
141 unless channels.empty?
142 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
# Do nothing for messages whose source matches an ignored hostmask.
145 ignore.each { |u| return if m.source.matches?(u) }
147 return if urls.empty?
148 debug "found urls #{urls.inspect}"
# Only public messages have a stored url list; otherwise list is nil.
149 list = m.public? ? @registry[m.target] : nil
150 debug "display link info: #{display_info}"
152 urls.each do |urlstr|
153 debug "working on #{urlstr}"
# Skip anything that is not http:// or https:// followed by at least one
# more character.
154 next unless urlstr =~ /^https?:\/\/./
156 debug "Getting title for #{urlstr}..."
159 title = get_title_for_url(urlstr,
160 :always_reply => m.address?,
161 :nick => m.source.nick,
162 :channel => m.channel,
163 :ircline => m.message)
164 debug "Title #{title ? '' : 'not '} found"
165 reply = "#{LINK_INFO} #{title}" if title
168 # we might get a 404 because of trailing punctuation, so we try again
169 # with the last character stripped. this might generate invalid URIs
170 # (e.g. because "some.url" gets chopped to some.url%2), so catch that too
171 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
172 # chop off last non-word character from the unescaped version of
173 # the URL, and retry if we still have enough string to look like an http(s) URL
175 unescaped = URI.unescape(urlstr)
176 debug "Unescaped: #{unescaped}"
# sub! returns nil when nothing was removed, so this branch is taken only if
# a trailing non-word character was actually stripped.
177 if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
178 urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
181 debug "Not retrying #{unescaped}"
184 reply = "Error #{e.message}"
# Reply only while the configured per-line limit exceeds urls_displayed
# (its initialisation and update are not visible in this listing).
187 if display_info > urls_displayed
189 m.reply reply, :overlong => :truncate, :to => :public,
190 :nick => (m.address? ? :auto : false)
197 # check to see if this url is already listed
198 next if list.find {|u| u.url == urlstr }
200 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
201 debug "#{list.length} urls so far"
# NOTE(review): an entry is dropped only when the list is already longer than
# url.max_urls; the statement that stores the new url is not visible here,
# but if it follows this line the list can hold max_urls + 1 entries — confirm.
202 list.pop if list.length > @bot.config['url.max_urls']
203 debug "storing url #{url.url}"
205 debug "#{list.length} urls now"
# Write the list back to the registry under the message target.
207 @registry[m.target] = list
# NOTE(review): the enclosing method header is not visible in this listing;
# the 'url info' / 'urls info' routes map to an action named 'info', so this
# is presumably its body — confirm.
# Escape the joined :urls parameter and extract the URLs it contains.
211 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
212 urls = URI.extract(escaped)
# :display_info is set to the number of :urls parameters; the call this
# option belongs to is not visible here.
216 :display_info => params[:urls].length,
# NOTE(review): the enclosing method header is not visible in this listing;
# this looks like the handler that scans ordinary messages — confirm.
# Escape the message text, extract its http/https URLs, and hand them to
# handle_urls in a new thread when any were found.
224 escaped = URI.escape(m.message, OUR_UNSAFE)
225 urls = URI.extract(escaped, ['http', 'https'])
226 return if urls.empty?
227 Thread.new { handle_urls(m, :urls => urls) }
# Replies with one line per stored url: "[timestamp] <nick> url", optionally
# followed by " --> title".
#
# Callers in this file pass :msg, :channel, :list and :max in +opts+; the
# assignments of the locals list, max and m are not visible in this listing.
# Nothing is done unless all three are set.
230 def reply_urls(opts={})
233 channel = opts[:channel]
235 return unless list and max and m
# NOTE(review): when max is 0 the range becomes 0..-1, which selects the whole
# list rather than nothing.
236 list[0..(max-1)].each do |url|
237 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
238 if @bot.config['url.info_on_list']
# Any error raised while fetching the info is swallowed and yields nil.
240 get_title_for_url(url.url,
241 :nick => url.nick, :channel => channel) rescue nil
242 # If the url info was missing and we now have some, try to upgrade it
243 if channel and title and not url.info
244 ll = @registry[channel]
246 if el = ll.find { |u| u.url == url.url }
248 @registry[channel] = ll
251 disp << " --> #{title}" if title
253 m.reply disp, :overlong => :truncate
# NOTE(review): the enclosing method header is not visible in this listing;
# this looks like the handler that lists stored urls — confirm.
# Use the :channel parameter when given, otherwise the message target.
258 channel = params[:channel] ? params[:channel] : m.target
259 max = params[:limit].to_i
262 list = @registry[channel]
264 m.reply "no urls seen yet for channel #{channel}"
266 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the urls stored for a channel and replies with the matches.
#
# params[:channel]:: channel to search; defaults to the message target
# params[:limit]::   maximum number of results, converted with to_i
# params[:string]::  regular-expression source, matched case-insensitively
#
# An entry matches when the pattern matches its url or nick, or its info when
# url.info_on_list is enabled.
270 def search(m, params)
271 channel = params[:channel] ? params[:channel] : m.target
272 max = params[:limit].to_i
273 string = params[:string]
# NOTE(review): the pattern comes from the command text; no handling of an
# invalid pattern is visible in this listing — confirm.
276 regex = Regexp.new(string, Regexp::IGNORECASE)
277 list = @registry[channel].find_all {|url|
278 regex.match(url.url) || regex.match(url.nick) ||
279 (@bot.config['url.info_on_list'] && regex.match(url.info))
282 m.reply "no matches for channel #{channel}"
284 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes. Every :limit
# parameter defaults to 4 and must consist of digits only. Some argument
# lines of these calls are not visible in this listing.
289 plugin = UrlPlugin.new
# Both "urls info" and "url info" dispatch to the 'info' action.
290 plugin.map 'urls info *urls', :action => 'info'
291 plugin.map 'url info *urls', :action => 'info'
# Search, with and without an explicit channel.
292 plugin.map 'urls search :channel :limit :string', :action => 'search',
293 :defaults => {:limit => 4},
294 :requirements => {:limit => /^\d+$/},
296 plugin.map 'urls search :limit :string', :action => 'search',
297 :defaults => {:limit => 4},
298 :requirements => {:limit => /^\d+$/},
# Listing, with and without an explicit channel.
300 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
301 :requirements => {:limit => /^\d+$/},
303 plugin.map 'urls :limit', :defaults => {:limit => 4},
304 :requirements => {:limit => /^\d+$/},