# Record describing one remembered URL. The fields are, in order: channel,
# nick, time, url and info.
# NOTE(review): define_structure is defined outside this view; presumably it
# creates a Struct-like class named Url -- confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Error class raised by this plugin when a link cannot be retrieved; the
# message carries the HTTP response code and message.
8 class ::UrlLinkError < RuntimeError
# Plugin that records the URLs mentioned in channels and can report
# information (title, content type, size, ...) about them.
11 class UrlPlugin < Plugin
# Prefix placed in front of link-info replies.
12 LINK_INFO = "[Link Info]"
# Matches every character that is not URI-unreserved, URI-reserved, '%', '#'
# or a space; used as the "unsafe" set when escaping message text.
# NOTE(review): the third argument ('N') is a legacy encoding flag; recent
# Ruby versions no longer accept a third argument to Regexp.new -- confirm the
# targeted Ruby version.
13 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Upper bound on the number of stored urls; must be strictly positive.
15 Config.register Config::IntegerValue.new('url.max_urls',
16 :default => 100, :validate => Proc.new{|v| v > 0},
17 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# Maximum number of links per line for which info is displayed (0 disables).
# The :default for this option is not visible in this view.
18 Config.register Config::IntegerValue.new('url.display_link_info',
20 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# When set, type/size information is not reported for links without a title.
# The :default for this option is not visible in this view.
21 Config.register Config::BooleanValue.new('url.titles_only',
23 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# When set, the first paragraph of the page is extracted along with the title.
# The :default for this option is not visible in this view.
24 Config.register Config::BooleanValue.new('url.first_par',
26 :desc => "Also try to get the first paragraph of a web page")
# When set, listings and searches also show (and search in) link info.
# The :default for this option is not visible in this view.
27 Config.register Config::BooleanValue.new('url.info_on_list',
29 :desc => "Show link info when listing/searching for urls")
# Host patterns for which no info is retrieved. The defaults cover localhost
# and the private/loopback IPv4 prefixes 192.168., 10., 127. and 172.16-31.
# Changing the value makes the 'url' plugin rebuild its combined regexp.
30 Config.register Config::ArrayValue.new('url.no_info_hosts',
31 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
32 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
33 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# Setup fragment; the enclosing method definition is not visible in this view
# (NOTE(review): presumably the constructor -- confirm).
# Registers a new Array as the registry default value.
38 @registry.set_default(Array.new)
# If the stored url.display_link_info value is not an Integer, feed its string
# form back through set_string.
# NOTE(review): presumably this coerces a value saved under an older,
# non-integer type of the option -- confirm.
39 unless @bot.config['url.display_link_info'].kind_of?(Integer)
40 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuilds @no_info_hosts: the patterns configured in url.no_info_hosts are
# joined with '|' into a single case-insensitive regexp, which is then logged
# through debug.
# NOTE(review): if the configured list is empty, the joined pattern is the
# empty string, and the resulting regexp matches every host -- confirm whether
# an empty list should instead disable the filter.
45 def reset_no_info_hosts
46 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
47 debug "no info hosts regexp set to #{@no_info_hosts}"
# Returns the usage text for the plugin's commands. Neither +plugin+ nor
# +topic+ is used by the visible body: the same string is returned regardless.
50 def help(plugin, topic="")
51 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
# Returns the title extracted from +pagedata+ by delegating to its
# ircify_html_title method.
# NOTE(review): ircify_html_title is defined outside this view; +pagedata+ is
# presumably a String holding (part of) an HTML page -- confirm.
54 def get_title_from_html(pagedata)
55 return pagedata.ircify_html_title
# Fetches the given URL and returns a short, IRC-formatted description of it.
#
# uri_str:: a URI object, or a String that is parsed with URI.parse
# opts::    optional hash; how it is consumed is not visible in this view
#           (NOTE(review): presumably the source of logopts -- confirm)
#
# Outcomes visible in this view:
# * bare return (nil) when the scheme does not match /https?/
# * a "Sorry, ..." string when the host matches @no_info_hosts
# * a "title: ..." string when a title is found in a text/HTML/XML response
# * a "type: ..." string with size and encoding details when no title is
#   reported and url.titles_only is not set
# * UrlLinkError carrying the response code and message (the branch that
#   raises it is not visible here)
# * RuntimeError "connecting to site/processing information (...)" from the
#   rescue clause (the lines between rescue and raise are not visible here)
#
# The :url_added event is fired with the collected logopts before each of the
# visible returns inside the response handler.
58 def get_title_for_url(uri_str, opts = {})
60 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# NOTE(review): the pattern is unanchored, so any scheme merely containing
# "http" passes this check -- consider /\Ahttps?\z/.
61 return if url.scheme !~ /https?/
# Refuse to fetch hosts covered by the url.no_info_hosts patterns.
63 if url.host =~ @no_info_hosts
64 return "Sorry, info retrieval for #{url.host} is disabled"
73 debug "+ getting #{url.request_uri}"
74 @bot.httputil.get_response(url) { |resp|
# Textual and (X)HTML/XML content types are inspected for a title.
80 if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
81 # The page is text or HTML, so we can try finding a title and, if
82 # requested, the first par.
84 # We act differently depending on whether we want the first par or
85 # not: in the first case we download the initial part and the parse
86 # it; in the second case we only download as much as we need to find
89 if @bot.config['url.first_par']
# Read up to http.info_bytes of the body and extract the title from it.
90 partial = resp.partial_body(@bot.config['http.info_bytes'])
91 logopts[:title] = title = get_title_from_html(partial)
# When the URL has a fragment, drop everything up to and including the
# matching <a name=...> anchor so the first paragraph is taken after it.
# NOTE(review): url.fragment is interpolated into the regexp unescaped, so
# regexp metacharacters in the fragment alter the pattern -- consider
# Regexp.escape.
92 if url.fragment and not url.fragment.empty?
93 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
94 partial.sub!(fragreg,'')
# Extract the first paragraph, asking the helper to strip the title text.
96 first_par = Utils.ircify_first_html_par(partial, :strip => title)
97 unless first_par.empty?
98 logopts[:extra] = first_par
# extra is initialised outside this view.
99 extra << ", #{Bold}text#{Bold}: #{first_par}"
101 call_event(:url_added, url.to_s, logopts)
102 return "#{Bold}title#{Bold}: #{title}#{extra}" if title
# Without url.first_par: the body is handed to the block in parts and the
# method returns as soon as one of them yields a title.
# NOTE(review): :url_added is fired for every part handed to the block,
# before the title check -- confirm this cannot fire more than once per URL.
104 resp.partial_body(@bot.config['http.info_bytes']) { |part|
105 logopts[:title] = title = get_title_from_html(part)
106 call_event(:url_added, url.to_s, logopts)
107 return "#{Bold}title#{Bold}: #{title}" if title
110 # if nothing was found, provide more basic info, as for non-html pages
# Collect content type and content encoding for the log and for the reply.
# NOTE(review): the condition guarding the two "encoding" lines is not
# visible here; presumably they only run when enc is set -- confirm.
115 enc = resp['content-encoding']
116 logopts[:extra] = String.new
117 logopts[:extra] << "Content Type: #{resp['content-type']}"
119 logopts[:extra] << ", encoding: #{enc}"
120 extra << ", #{Bold}encoding#{Bold}: #{enc}"
123 unless @bot.config['url.titles_only']
124 # content doesn't have title, just display info.
# Insert thousands separators into the content-length header value; the
# inline rescue leaves size nil when the header is missing.
125 size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
127 logopts[:extra] << ", size: #{size} bytes"
128 size = ", #{Bold}size#{Bold}: #{size} bytes"
130 call_event(:url_added, url.to_s, logopts)
131 return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
133 call_event(:url_added, url.to_s, logopts)
135 raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
# NOTE(review): rescuing Exception also catches non-StandardError exceptions
# such as interrupts and exit requests; StandardError is usually the intended
# scope.
139 rescue Exception => e
145 raise "connecting to site/processing information (#{e.message})"
# Processes the URLs found in a message: fetches link info for them, replies
# with it within the display_info limit, and records previously unseen URLs in
# the list stored in @registry under the message target.
#
# m::            the message the URLs came from
# urls::         collection of URL strings; nothing is done when it is empty
# display_info:: upper bound on how many link infos are shown for this call;
#                defaults to the url.display_link_info setting
150 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
151 return if urls.empty?
152 debug "found urls #{urls.inspect}"
# Only public messages have a stored list; otherwise list is nil.
153 list = m.public? ? @registry[m.target] : nil
154 debug "display link info: #{display_info}"
156 urls.each do |urlstr|
157 debug "working on #{urlstr}"
# Anything that does not start with http: or https: is ignored.
158 next unless urlstr =~ /^https?:/
160 debug "Getting title for #{urlstr}..."
162 title = get_title_for_url(urlstr,
163 :nick => m.source.nick,
164 :channel => m.channel,
165 :ircline => m.message)
166 debug "Title #{title ? '' : 'not '} found"
# Failure report; the rescue clause that binds e is not visible in this view.
168 m.reply "Error #{e.message}"
# urls_displayed is initialised and updated outside this view.
171 if display_info > urls_displayed
173 m.reply("#{LINK_INFO} #{title}", :overlong => :truncate)
180 # check to see if this url is already listed
# NOTE(review): list is nil for non-public messages; a guard is presumably on
# a line not visible here -- confirm.
181 next if list.find {|u| u.url == urlstr }
183 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
184 debug "#{list.length} urls so far"
# Drop the last element once the list is longer than url.max_urls.
# NOTE(review): the comparison is '>' and the insertion of the new url is not
# visible here; check whether the list can end up holding max_urls + 1 items.
185 list.pop if list.length > @bot.config['url.max_urls']
186 debug "storing url #{url.url}"
188 debug "#{list.length} urls now"
# Write the list back to the registry under the message target.
190 @registry[m.target] = list
# Body fragment of a command handler; the enclosing definition is not visible
# in this view (NOTE(review): presumably the 'info' action, taking m and
# params -- confirm).
# Escapes the characters matched by OUR_UNSAFE in the given text, extracts the
# URLs from the result and handles them in a new thread.
# NOTE(review): URI.escape was removed in Ruby 3.0 -- confirm the targeted
# Ruby version.
194 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
195 urls = URI.extract(escaped)
# display_info is set to the length of params[:urls] rather than to the
# configured limit.
196 Thread.new { handle_urls(m, urls, params[:urls].length) }
# Body fragment of a message hook; the enclosing definition is not visible in
# this view (NOTE(review): presumably the handler invoked for every incoming
# message -- confirm).
# Only PrivMessage instances are processed.
200 return unless m.kind_of?(PrivMessage)
# Escape the characters matched by OUR_UNSAFE, extract the URLs and handle
# them in a new thread, using the default display_info limit.
# NOTE(review): URI.escape was removed in Ruby 3.0 -- confirm the targeted
# Ruby version.
203 escaped = URI.escape(m.message, OUR_UNSAFE)
204 urls = URI.extract(escaped)
205 Thread.new { handle_urls(m, urls) }
# Replies with one line per stored url, for at most +max+ entries of +list+.
#
# opts:: hash of options; only :channel is read in the visible lines. The
#        assignments of list, max and m are not visible here
#        (NOTE(review): presumably taken from opts[:list], opts[:max] and
#        opts[:msg] -- confirm).
#
# Does nothing unless list, max and m are all set.
208 def reply_urls(opts={})
211 channel = opts[:channel]
213 return unless list and max and m
214 list[0..(max-1)].each do |url|
# Base line: timestamp, nick and url.
215 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
216 if @bot.config['url.info_on_list']
# The start of this expression (the assignment of title) is not visible here;
# any error raised while fetching is swallowed by the inline rescue.
218 get_title_for_url(url.url,
219 :nick => url.nick, :channel => channel) rescue nil
220 # If the url info was missing and we now have some, try to upgrade it
221 if channel and title and not url.info
222 ll = @registry[channel]
# Locate the stored entry for this url; the update itself is on a line not
# visible here. The list is then written back to the registry.
224 if el = ll.find { |u| u.url == url.url }
226 @registry[channel] = ll
229 disp << " --> #{title}" if title
231 m.reply disp, :overlong => :truncate
# Body fragment of a command handler; the enclosing definition is not visible
# in this view (NOTE(review): presumably the handler behind the
# 'urls [:channel] :limit' maps -- confirm).
# The channel defaults to the message target when none is given.
236 channel = params[:channel] ? params[:channel] : m.target
237 max = params[:limit].to_i
# NOTE(review): any clamping of max happens on lines not visible here.
240 list = @registry[channel]
# The condition choosing between the two replies below is not visible here.
242 m.reply "no urls seen yet for channel #{channel}"
244 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Searches the urls stored for a channel and replies with the matches.
#
# m::      the requesting message; its target is used when no channel is given
# params:: reads :channel (optional), :limit and :string (the regexp source)
#
# An entry matches when the case-insensitive regexp matches its url or nick,
# or, when url.info_on_list is set, its info.
248 def search(m, params)
249 channel = params[:channel] ? params[:channel] : m.target
250 max = params[:limit].to_i
251 string = params[:string]
# NOTE(review): string is user-supplied and compiled as-is; no handling of an
# invalid pattern (RegexpError) is visible here -- confirm.
254 regex = Regexp.new(string, Regexp::IGNORECASE)
255 list = @registry[channel].find_all {|url|
256 regex.match(url.url) || regex.match(url.nick) ||
257 (@bot.config['url.info_on_list'] && regex.match(url.info))
# The condition choosing between the two replies below is not visible here.
260 m.reply "no matches for channel #{channel}"
262 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command maps. Some continuation
# lines of the map calls below are not visible in this view.
267 plugin = UrlPlugin.new
# 'urls info ...' and 'url info ...' both dispatch to the 'info' action.
268 plugin.map 'urls info *urls', :action => 'info'
269 plugin.map 'url info *urls', :action => 'info'
# Search with an explicit channel; :limit must be all digits and defaults to 4.
270 plugin.map 'urls search :channel :limit :string', :action => 'search',
271 :defaults => {:limit => 4},
272 :requirements => {:limit => /^\d+$/},
# Search without a channel; same :limit rules.
274 plugin.map 'urls search :limit :string', :action => 'search',
275 :defaults => {:limit => 4},
276 :requirements => {:limit => /^\d+$/},
# Listing with an explicit channel; same :limit rules.
278 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
279 :requirements => {:limit => /^\d+$/},
# Listing without a channel; same :limit rules.
281 plugin.map 'urls :limit', :defaults => {:limit => 4},
282 :requirements => {:limit => /^\d+$/},