# Declares the Url record type with the fields channel, nick, time, url and info;
# instances are built positionally with Url.new(...) when a url is stored.
# NOTE(review): define_structure is not defined in this listing - presumably a framework helper; confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Plugin subclass that records urls mentioned in messages and reports link info for them.
# NOTE(review): this listing carries embedded line numbers with gaps, so some option lines of the
# Config.register calls below (e.g. their :default values) and all closing `end`s are not visible here.
8 class UrlPlugin < Plugin
# Prefix placed in front of link-info replies.
9 LINK_INFO = "[Link Info]"
# Character class matching everything outside the URI unreserved/reserved sets, '%', '#' and space;
# passed as the "unsafe" set when message text is escaped before url extraction.
# NOTE(review): the three-argument Regexp.new form (encoding flag 'N') is rejected by recent Ruby
# releases - confirm the targeted Ruby version.
10 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration options; each option's :desc string is its user-facing documentation.
12 Config.register Config::IntegerValue.new('url.max_urls',
13 :default => 100, :validate => Proc.new{|v| v > 0},
14 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
15 Config.register Config::IntegerValue.new('url.display_link_info',
17 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
18 Config.register Config::BooleanValue.new('url.titles_only',
20 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
21 Config.register Config::BooleanValue.new('url.first_par',
23 :desc => "Also try to get the first paragraph of a web page")
24 Config.register Config::BooleanValue.new('url.info_on_list',
26 :desc => "Show link info when listing/searching for urls")
# Changing this option triggers reset_no_info_hosts on the 'url' plugin so the cached regexp is rebuilt.
27 Config.register Config::ArrayValue.new('url.no_info_hosts',
28 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
29 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
30 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# NOTE(review): the closing argument line of the next two registrations is not visible in this listing.
31 Config.register Config::ArrayValue.new('url.only_on_channels',
32 :desc => "Show link info only on these channels",
34 Config.register Config::ArrayValue.new('url.ignore',
35 :desc => "Don't show link info for urls from users represented as hostmasks on this list. Useful for ignoring other bots, for example.",
# NOTE(review): the `def` line enclosing these statements is not visible in this listing
# (presumably the constructor - confirm).
# Registry default is a fresh Array - presumably so lookups of unknown keys yield an empty list; confirm.
40 @registry.set_default(Array.new)
# When the stored url.display_link_info value is not an Integer, re-set the option from its
# string form (presumably converting a legacy non-integer setting - confirm).
41 unless @bot.config['url.display_link_info'].kind_of?(Integer)
42 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuilds the cached, case-insensitive regexp (@no_info_hosts) from the
# 'url.no_info_hosts' config list. Each list entry is a regexp source; the
# entries are OR-ed together. Called when the option changes.
#
# Returns whatever +debug+ returns; the useful effect is the updated
# @no_info_hosts instance variable.
def reset_no_info_hosts
  patterns = @bot.config['url.no_info_hosts']
  @no_info_hosts =
    if patterns.empty?
      # Joining an empty list gives "", which compiles to a regexp that matches
      # EVERY host and would disable link info everywhere. An empty list must
      # match nothing instead; Regexp.union with no arguments is /(?!)/.
      Regexp.union
    else
      Regexp.new(patterns.join('|'), Regexp::IGNORECASE)
    end
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Returns the usage text for the plugin's commands (url info, urls, urls search).
# Neither +plugin+ nor +topic+ is used in the visible body: the same text is returned for every topic.
52 def help(plugin, topic="")
53 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns the result of calling ircify_html_title on +pagedata+.
# NOTE(review): ircify_html_title is not defined in this listing - presumably a String extension
# that extracts an IRC-formatted <title>; confirm.
56 def get_title_from_html(pagedata)
57 return pagedata.ircify_html_title
# Builds the link-info string for a url: title plus, depending on configuration,
# first-paragraph text, content type, encoding and size, joined with ", ".
#
# uri_str:: a URI object, or a String that is parsed with URI.parse
# opts::    options hash; with :always_reply set, a host excluded by url.no_info_hosts
#           produces an explanatory message instead of false
#
# NOTE(review): several lines of this method are not visible in this listing, including the
# ones that assign +extra+, +logopts+ and +resp+ and the begin/rescue/else/end structure.
60 def get_title_for_url(uri_str, opts = {})
62 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# Only http/https urls are processed.
# NOTE(review): the pattern is unanchored, so any scheme merely containing "http" passes - confirm intent.
63 return if url.scheme !~ /https?/
65 # also check the ip, the canonical name and the aliases
67 checks = TCPSocket.gethostbyname(url.host)
# Uses +e+, so presumably sits in a rescue clause for the lookup above: the failure text is the result.
70 return "Unable to retrieve info for #{url.host}: #{e.message}"
# Refuse hosts for which any lookup entry matches the url.no_info_hosts regexp.
76 unless checks.grep(@no_info_hosts).empty?
77 return ( opts[:always_reply] ? "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled" : false )
86 debug "+ getting info for #{url.request_uri}"
# Page information comes from the :htmlinfo filter; :title and :content are read from it below.
87 info = @bot.filter(:htmlinfo, url)
91 logopts[:title] = title = info[:title]
94 logopts[:extra] = info[:content]
95 extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
# The lines below build a header-based description (type, encoding, size) in logopts[:extra];
# each item is also shown to the user when there is no title (encoding/size also when url.first_par is on).
97 logopts[:extra] = String.new
98 logopts[:extra] << "Content Type: #{resp['content-type']}"
99 extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
100 if enc = resp['content-encoding']
101 logopts[:extra] << ", encoding: #{enc}"
102 extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
# Content-length with ',' inserted as thousands separator; any error in the expression yields nil.
105 size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
107 logopts[:extra] << ", size: #{size} bytes"
108 extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
# NOTE(review): rescuing Exception also traps non-StandardError exceptions - confirm this is intended.
111 rescue Exception => e
# Failures are re-raised as a RuntimeError whose message embeds the original error text.
117 raise "connecting to site/processing information (#{e.message})"
# Announce the url, with the collected log options, through the :url_added event.
121 call_event(:url_added, url.to_s, logopts)
# The title goes first in the reply.
123 extra.unshift("#{Bold}title#{Bold}: #{title}")
# With url.titles_only set, a result is returned only when a title was found.
125 return extra.join(", ") if title or not @bot.config['url.titles_only']
# Processes the urls found for message +m+: optionally replies with link info for each one
# and records new urls in the registry list keyed by the message target.
#
# m::      the message being handled
# params:: options; the visible defaults are :display_info (url.display_link_info),
#          :channels (url.only_on_channels) and :ignore (url.ignore)
#
# NOTE(review): several lines are not visible in this listing, including the ones that build
# +opts+, assign +urls+ and +urls_displayed+, insert into +list+, and the begin/rescue/end structure.
128 def handle_urls(m, params={})
130 :display_info => @bot.config['url.display_link_info'],
131 :channels => @bot.config['url.only_on_channels'],
132 :ignore => @bot.config['url.ignore']
135 display_info= opts[:display_info]
136 channels = opts[:channels]
137 ignore = opts[:ignore]
# A non-empty channel list restricts handling to those channels (case-insensitive comparison).
139 unless channels.empty?
140 return unless channels.map { |c| c.downcase }.include?(m.channel.downcase)
# Do nothing for sources matching any entry of the ignore list.
143 ignore.each { |u| return if m.source.matches?(u) }
145 return if urls.empty?
146 debug "found urls #{urls.inspect}"
# Urls are only stored for public messages; +list+ is nil otherwise.
147 list = m.public? ? @registry[m.target] : nil
148 debug "display link info: #{display_info}"
150 urls.each do |urlstr|
151 debug "working on #{urlstr}"
# Skip anything that is not an http(s) url with at least one character after the scheme.
152 next unless urlstr =~ /^https?:\/\/./
154 debug "Getting title for #{urlstr}..."
157 title = get_title_for_url(urlstr,
158 :always_reply => m.address?,
159 :nick => m.source.nick,
160 :channel => m.channel,
161 :ircline => m.message)
162 debug "Title #{title ? '' : 'not '} found"
163 reply = "#{LINK_INFO} #{title}" if title
166 # we might get a 404 because of trailing punctuation, so we try again
167 # with the last character stripped. this might generate invalid URIs
168 # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
169 if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
170 # chop off last character, and retry if we still have enough string to
171 # look like a minimal URL
# Each retry shortens urlstr by one character, so the retry loop is bounded by the url length.
172 retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
174 reply = "Error #{e.message}"
# Reply only while fewer than display_info urls have been shown.
177 if display_info > urls_displayed
179 m.reply reply, :overlong => :truncate, :to => :public,
180 :nick => (m.address? ? :auto : false)
187 # check to see if this url is already listed
188 next if list.find {|u| u.url == urlstr }
190 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
191 debug "#{list.length} urls so far"
# NOTE(review): with `>` the list is trimmed only once it already exceeds url.max_urls; if an
# insertion follows (that line is not visible here) it can hold max_urls + 1 entries - confirm whether `>=` was intended.
192 list.pop if list.length > @bot.config['url.max_urls']
193 debug "storing url #{url.url}"
195 debug "#{list.length} urls now"
# Write the list back to the registry under the message target.
197 @registry[m.target] = list
# NOTE(review): the `def` line enclosing these statements is not visible in this listing
# (presumably the 'info' action that the 'url info *urls' maps point to - confirm).
# The :urls parameter is escaped with OUR_UNSAFE before urls are extracted from it.
# NOTE(review): URI.escape is not available in recent Ruby releases - confirm the targeted Ruby version.
201 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
202 urls = URI.extract(escaped)
# Allow info for as many links as params[:urls] has elements
# (this is an option in a call whose first line is not visible here).
206 :display_info => params[:urls].length,
# NOTE(review): the `def` line enclosing these statements is not visible in this listing
# (presumably the handler run for each incoming message - confirm).
# Escape the message text, then pull out the http/https urls it contains.
214 escaped = URI.escape(m.message, OUR_UNSAFE)
215 urls = URI.extract(escaped, ['http', 'https'])
216 return if urls.empty?
# The urls are handled on a new thread, so this handler does not wait for handle_urls to finish.
217 Thread.new { handle_urls(m, :urls => urls) }
# Replies with one line per stored url, for at most +max+ entries of +list+.
#
# opts:: options hash; :channel is read in the visible lines, and callers in this file also
#        pass :msg, :list and :max
#
# NOTE(review): the lines assigning +list+, +max+, +m+ and +title+ are not visible in this listing.
220 def reply_urls(opts={})
223 channel = opts[:channel]
# Nothing is sent unless list, max and m are all set.
225 return unless list and max and m
226 list[0..(max-1)].each do |url|
# Line format: [timestamp] <nick> url
227 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
228 if @bot.config['url.info_on_list']
# Errors raised while fetching the title are swallowed and give nil.
230 get_title_for_url(url.url,
231 :nick => url.nick, :channel => channel) rescue nil
232 # If the url info was missing and we now have some, try to upgrade it
233 if channel and title and not url.info
234 ll = @registry[channel]
# Find the stored entry with the same url, then write the channel's list back to the registry.
236 if el = ll.find { |u| u.url == url.url }
238 @registry[channel] = ll
241 disp << " --> #{title}" if title
243 m.reply disp, :overlong => :truncate
# NOTE(review): the `def` line enclosing these statements is not visible in this listing
# (presumably the action behind the 'urls :channel :limit' / 'urls :limit' maps - confirm).
# The channel defaults to the message target when no :channel parameter is given.
248 channel = params[:channel] ? params[:channel] : m.target
249 max = params[:limit].to_i
252 list = @registry[channel]
# The condition guarding this reply is not visible in this listing.
254 m.reply "no urls seen yet for channel #{channel}"
256 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the urls stored for a channel with a case-insensitive regexp and replies with the matches.
#
# m::      the requesting message; its target is the channel when params[:channel] is absent
# params:: :channel (optional), :limit (converted with to_i) and :string (the regexp source)
#
# NOTE(review): some lines (e.g. the condition around the "no matches" reply) are not visible in this listing.
260 def search(m, params)
261 channel = params[:channel] ? params[:channel] : m.target
262 max = params[:limit].to_i
263 string = params[:string]
# NOTE(review): +string+ is user-supplied; an invalid pattern makes Regexp.new raise RegexpError -
# confirm that it is rescued in the lines not visible here.
266 regex = Regexp.new(string, Regexp::IGNORECASE)
# An entry matches on its url or nick, and on its stored info only when url.info_on_list is enabled.
267 list = @registry[channel].find_all {|url|
268 regex.match(url.url) || regex.match(url.nick) ||
269 (@bot.config['url.info_on_list'] && regex.match(url.info))
272 m.reply "no matches for channel #{channel}"
274 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes.
279 plugin = UrlPlugin.new
# Both 'urls info' and 'url info' are routed to the 'info' action.
280 plugin.map 'urls info *urls', :action => 'info'
281 plugin.map 'url info *urls', :action => 'info'
# In the routes below :limit defaults to 4 and must consist of digits only.
# NOTE(review): each of these map calls ends in a trailing comma; the continuation lines
# are not visible in this listing.
282 plugin.map 'urls search :channel :limit :string', :action => 'search',
283 :defaults => {:limit => 4},
284 :requirements => {:limit => /^\d+$/},
286 plugin.map 'urls search :limit :string', :action => 'search',
287 :defaults => {:limit => 4},
288 :requirements => {:limit => /^\d+$/},
290 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
291 :requirements => {:limit => /^\d+$/},
293 plugin.map 'urls :limit', :defaults => {:limit => 4},
294 :requirements => {:limit => /^\d+$/},