6 define_structure :Url, :channel, :nick, :time, :url, :info
8 class ::UrlLinkError < RuntimeError
11 class UrlPlugin < Plugin
12 TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
13 LINK_INFO = "[Link Info]"
14 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
16 Config.register Config::IntegerValue.new('url.max_urls',
17 :default => 100, :validate => Proc.new{|v| v > 0},
18 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
19 Config.register Config::IntegerValue.new('url.display_link_info',
21 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
22 Config.register Config::BooleanValue.new('url.titles_only',
24 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
25 Config.register Config::BooleanValue.new('url.first_par',
27 :desc => "Also try to get the first paragraph of a web page")
28 Config.register Config::BooleanValue.new('url.info_on_list',
30 :desc => "Show link info when listing/searching for urls")
31 Config.register Config::ArrayValue.new('url.no_info_hosts',
32 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
33 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
34 :desc => "A list of regular expressions matching hosts for which no info should be provided")
39 @registry.set_default(Array.new)
40 unless @bot.config['url.display_link_info'].kind_of?(Integer)
41 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
46 def reset_no_info_hosts
47 @no_info_hosts = Regexp.new(@bot.config['url.no_info_hosts'].join('|'), true)
48 debug "no info hosts regexp set to #{@no_info_hosts}"
51 def help(plugin, topic="")
52 "url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url), urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
55 def get_title_from_html(pagedata)
56 return unless TITLE_RE.match(pagedata)
60 def get_title_for_url(uri_str, opts = {})
62 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
63 return if url.scheme !~ /https?/
65 if url.host =~ @no_info_hosts
66 return "Sorry, info retrieval for #{url.host} is disabled"
75 debug "+ getting #{url.request_uri}"
76 @bot.httputil.get_response(url) { |resp|
82 if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
83 # The page is text or HTML, so we can try finding a title and, if
84 # requested, the first par.
86 # We act differently depending on whether we want the first par or
87 # not: in the first case we download the initial part and then parse
88 # it; in the second case we only download as much as we need to find
91 if @bot.config['url.first_par']
92 partial = resp.partial_body(@bot.config['http.info_bytes'])
93 logopts[:title] = title = get_title_from_html(partial)
94 if url.fragment and not url.fragment.empty?
95 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
96 partial.sub!(fragreg,'')
98 first_par = Utils.ircify_first_html_par(partial, :strip => title)
99 unless first_par.empty?
100 logopts[:extra] = first_par
101 extra << ", #{Bold}text#{Bold}: #{first_par}"
103 call_event(:url_added, url.to_s, logopts)
104 return "#{Bold}title#{Bold}: #{title}#{extra}" if title
106 resp.partial_body(@bot.config['http.info_bytes']) { |part|
107 logopts[:title] = title = get_title_from_html(part)
108 call_event(:url_added, url.to_s, logopts)
109 return "#{Bold}title#{Bold}: #{title}" if title
112 # if nothing was found, provide more basic info, as for non-html pages
117 enc = resp['content-encoding']
118 logopts[:extra] = String.new
119 logopts[:extra] << "Content Type: #{resp['content-type']}"
121 logopts[:extra] << ", encoding: #{enc}"
122 extra << ", #{Bold}encoding#{Bold}: #{enc}"
125 unless @bot.config['url.titles_only']
126 # content doesn't have title, just display info.
127 size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
129 logopts[:extra] << ", size: #{size} bytes"
130 size = ", #{Bold}size#{Bold}: #{size} bytes"
132 call_event(:url_added, url.to_s, logopts)
133 return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
135 call_event(:url_added, url.to_s, logopts)
137 raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
141 rescue Exception => e
147 raise "connecting to site/processing information (#{e.message})"
152 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
153 return if urls.empty?
154 debug "found urls #{urls.inspect}"
156 list = @registry[m.target]
162 debug "working on #{urlstr}"
163 next unless urlstr =~ /^https?:/
165 debug "display link info: #{display_info}"
166 if display_info > urls_displayed
169 debug "Getting title for #{urlstr}..."
171 title = get_title_for_url(urlstr,
172 :nick => m.source.nick,
173 :channel => m.channel,
174 :ircline => m.message)
176 m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
179 debug "Title not found!"
182 m.reply "Error #{e.message}"
189 # check to see if this url is already listed
190 next if list.find {|u| u.url == urlstr }
192 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
193 debug "#{list.length} urls so far"
194 if list.length > @bot.config['url.max_urls']
197 debug "storing url #{url.url}"
199 debug "#{list.length} urls now"
201 @registry[m.target] = list
205 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
206 urls = URI.extract(escaped)
207 handle_urls(m, urls, params[:urls].length)
211 return unless m.kind_of?(PrivMessage)
214 escaped = URI.escape(m.message, OUR_UNSAFE)
215 urls = URI.extract(escaped)
219 def reply_urls(opts={})
222 channel = opts[:channel]
224 return unless list and max and m
225 list[0..(max-1)].each do |url|
226 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
227 if @bot.config['url.info_on_list']
229 get_title_for_url(url.url,
230 :nick => url.nick, :channel => channel) rescue nil
231 # If the url info was missing and we now have some, try to upgrade it
232 if channel and title and not url.info
233 ll = @registry[channel]
235 if el = ll.find { |u| u.url == url.url }
237 @registry[channel] = ll
240 disp << " --> #{title}" if title
242 m.reply disp, :overlong => :truncate
247 channel = params[:channel] ? params[:channel] : m.target
248 max = params[:limit].to_i
251 list = @registry[channel]
253 m.reply "no urls seen yet for channel #{channel}"
255 reply_urls :msg => m, :channel => channel, :list => list, :max => max
259 def search(m, params)
260 channel = params[:channel] ? params[:channel] : m.target
261 max = params[:limit].to_i
262 string = params[:string]
265 regex = Regexp.new(string, Regexp::IGNORECASE)
266 list = @registry[channel].find_all {|url|
267 regex.match(url.url) || regex.match(url.nick) ||
268 (@bot.config['url.info_on_list'] && regex.match(url.info))
271 m.reply "no matches for channel #{channel}"
273 reply_urls :msg => m, :channel => channel, :list => list, :max => max
278 plugin = UrlPlugin.new
279 plugin.map 'urls info *urls', :action => 'info'
280 plugin.map 'url info *urls', :action => 'info'
281 plugin.map 'urls search :channel :limit :string', :action => 'search',
282 :defaults => {:limit => 4},
283 :requirements => {:limit => /^\d+$/},
285 plugin.map 'urls search :limit :string', :action => 'search',
286 :defaults => {:limit => 4},
287 :requirements => {:limit => /^\d+$/},
289 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
290 :requirements => {:limit => /^\d+$/},
292 plugin.map 'urls :limit', :defaults => {:limit => 4},
293 :requirements => {:limit => /^\d+$/},