# Record for one stored link: the channel it was seen in, the nick that posted
# it, the time it was recorded, the url string and any info gathered for it.
# NOTE(review): define_structure is a project helper not shown here —
# presumably it builds a Struct-like class named Url; confirm.
6 define_structure :Url, :channel, :nick, :time, :url, :info
# Top-level error class raised by get_title_for_url when fetching a link fails
# (its message carries the HTTP response code and message).
8 class ::UrlLinkError < RuntimeError
# Plugin that remembers urls mentioned on channels and can report link info
# (title, first paragraph, content type, size) for them.
11 class UrlPlugin < Plugin
# Captures the text between <title> and </title>; case-insensitive and
# multiline, with optional whitespace inside the tag brackets.
12 TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
# Prefix put in front of link-info replies.
13 LINK_INFO = "[Link Info]"
# Characters escaped before url extraction: anything that is not a URI
# unreserved or reserved character, '%', '#' or a space.
# NOTE(review): the three-argument Regexp.new form (kcode 'N') and the
# URI::PATTERN constants are legacy APIs — confirm the target Ruby version
# still accepts them.
14 OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
# Configuration keys registered by this plugin.

# Upper bound on stored urls; must be positive.
16 Config.register Config::IntegerValue.new('url.max_urls',
17 :default => 100, :validate => Proc.new{|v| v > 0},
18 :desc => "Maximum number of urls to store. New urls replace oldest ones.")
# Number of links per line for which info is fetched automatically.
19 Config.register Config::IntegerValue.new('url.display_link_info',
21 :desc => "Get the title of links pasted to the channel and display it (also tells if the link is broken or the site is down). Do it for at most this many links per line (set to 0 to disable)")
# When set, links without a <title> produce no info line.
22 Config.register Config::BooleanValue.new('url.titles_only',
24 :desc => "Only show info for links that have <title> tags (in other words, don't display info for jpegs, mpegs, etc.)")
# When set, the first paragraph of the page is fetched along with the title.
25 Config.register Config::BooleanValue.new('url.first_par',
27 :desc => "Also try to get the first paragraph of a web page")
# When set, list/search output also includes link info.
28 Config.register Config::BooleanValue.new('url.info_on_list',
30 :desc => "Show link info when listing/searching for urls")
# Host patterns for which no info is fetched. The defaults cover localhost and
# the 192.168.x, 10.x, 127.x and 172.16-172.31 address prefixes. Changing the
# list rebuilds the cached regexp through reset_no_info_hosts.
31 Config.register Config::ArrayValue.new('url.no_info_hosts',
32 :default => ['localhost', '^192\.168\.', '^10\.', '^127\.', '^172\.(1[6-9]|2\d|31)\.'],
33 :on_change => Proc.new { |bot, v| bot.plugins['url'].reset_no_info_hosts },
34 :desc => "A list of regular expressions matching hosts for which no info should be provided")
# Use an empty Array as the registry default (presumably what a channel with
# no stored urls yields — confirm against the registry implementation).
39 @registry.set_default(Array.new)
# If the stored url.display_link_info value is not an Integer, re-set the
# config item from the value's string form.
# NOTE(review): presumably this converts a value saved by an older,
# non-integer version of the option — confirm.
40 unless @bot.config['url.display_link_info'].kind_of?(Integer)
41 @bot.config.items[:'url.display_link_info'].set_string(@bot.config['url.display_link_info'].to_s)
# Rebuild the cached, case-insensitive regexp that decides which hosts get no
# link info, from the 'url.no_info_hosts' setting.
#
# Each entry is a regexp source and the entries are OR-ed together. Blank
# entries are dropped, and an empty list yields a regexp that never matches.
# Joining an empty list would produce an empty pattern, which matches every
# host and would silently disable link info everywhere.
def reset_no_info_hosts
  patterns = Array(@bot.config['url.no_info_hosts']).reject { |pat| pat.to_s.empty? }
  @no_info_hosts =
    if patterns.empty?
      /(?!)/ # empty negative lookahead: can never match
    else
      Regexp.new(patterns.join('|'), Regexp::IGNORECASE)
    end
  debug "no info hosts regexp set to #{@no_info_hosts}"
end
# Usage summary for the plugin's commands. Both arguments are accepted but not
# consulted: the same text is returned whatever the topic.
def help(plugin, topic="")
  commands = [
    'url info <url> => display link info for <url> (set url.display_link_info > 0 if you want the bot to do it automatically when someone writes an url)',
    'urls [<max>=4] => list <max> last urls mentioned in current channel',
    'urls search [<max>=4] <regexp> => search for matching urls'
  ]
  private_note = 'In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>'
  "#{commands.join(', ')}. #{private_note}"
end
# Look for a <title> element in a chunk of HTML.
#
# pagedata:: the text to search
#
# Returns nil straight away when TITLE_RE does not match pagedata.
55 def get_title_from_html(pagedata)
56 return unless TITLE_RE.match(pagedata)
# Fetch a link and build a one-line description of it: the page title (and,
# if url.first_par is set, its first paragraph) for text/HTML/XML responses,
# otherwise the content type, encoding and size.
#
# uri_str:: a String or an URI
# opts::    extra options; presumably the source of the logopts hash passed to
#           the url_added event — TODO confirm, logopts is set up out of view
#
# Returns nil for non-http(s) schemes, a "disabled" notice for hosts matching
# @no_info_hosts, or the description string.
# Raises UrlLinkError carrying the response code and message (the triggering
# condition is not visible here), and a RuntimeError wrapping other failures.
60 def get_title_for_url(uri_str, opts = {})
# Accept either an URI object or a string to be parsed.
62 url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
# Only http/https links are handled.
# NOTE(review): the pattern is unanchored, so any scheme merely containing
# "http" passes — consider /\Ahttps?\z/.
63 return if url.scheme !~ /https?/
# Hosts matching the configured no-info patterns are never fetched.
65 if url.host =~ @no_info_hosts
66 return "Sorry, info retrieval for #{url.host} is disabled"
75 debug "+ getting #{url.request_uri}"
76 @bot.httputil.get_response(url) { |resp|
# Matches a content type starting with "text/" or containing "xml" or "html".
82 if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
83 # The page is text or HTML, so we can try finding a title and, if
84 # requested, the first par.
86 # We act differently depending on whether we want the first par or
87 # not: in the first case we download the initial part and then parse
88 # it; in the second case we only download as much as we need to find
91 if @bot.config['url.first_par']
92 partial = resp.partial_body(@bot.config['http.info_bytes'])
93 logopts[:title] = title = get_title_from_html(partial)
# When the url has a fragment, drop everything up to and including the
# matching <a name=...> anchor so the "first paragraph" starts there.
# NOTE(review): url.fragment is interpolated into the regexp unescaped;
# regexp metacharacters in the fragment change the pattern — consider
# Regexp.escape.
94 if url.fragment and not url.fragment.empty?
95 fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
96 partial.sub!(fragreg,'')
98 first_par = Utils.ircify_first_html_par(partial, :strip => title)
99 unless first_par.empty?
100 logopts[:extra] = first_par
101 extra << ", #{Bold}text#{Bold}: #{first_par}"
103 call_event(:url_added, url.to_s, logopts)
104 return "#{Bold}title#{Bold}: #{title}#{extra}" if title
# Title-only mode: read the body part by part and return as soon as a part
# yields a title.
106 resp.partial_body(@bot.config['http.info_bytes']) { |part|
107 logopts[:title] = title = get_title_from_html(part)
108 call_event(:url_added, url.to_s, logopts)
109 return "#{Bold}title#{Bold}: #{title}" if title
112 # if nothing was found, provide more basic info, as for non-html pages
117 enc = resp['content-encoding']
118 logopts[:extra] = String.new
119 logopts[:extra] << "Content Type: #{resp['content-type']}"
121 logopts[:extra] << ", encoding: #{enc}"
122 extra << ", #{Bold}encoding#{Bold}: #{enc}"
125 unless @bot.config['url.titles_only']
126 # content doesn't have title, just display info.
# Group the digits of the content length in threes with commas; the inline
# rescue leaves size nil when the header is absent.
127 size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
129 logopts[:extra] << ", size: #{size} bytes"
130 size = ", #{Bold}size#{Bold}: #{size} bytes"
132 call_event(:url_added, url.to_s, logopts)
133 return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
135 call_event(:url_added, url.to_s, logopts)
137 raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
# NOTE(review): rescuing Exception also catches non-StandardError exceptions
# (e.g. SystemExit, Interrupt) — consider rescuing StandardError.
141 rescue Exception => e
147 raise "connecting to site/processing information (#{e.message})"
# Process the urls found in a message: optionally reply with link info and,
# when a registry list is available, remember each new url in it.
#
# m::            the message the urls came from
# urls::         array of url strings
# display_info:: upper bound on how many links get an info reply (defaults to
#                the url.display_link_info setting)
152 def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
153 return if urls.empty?
154 debug "found urls #{urls.inspect}"
# Urls are only stored for public messages; list stays nil otherwise.
155 list = m.public? ? @registry[m.target] : nil
156 debug "display link info: #{display_info}"
158 urls.each do |urlstr|
159 debug "working on #{urlstr}"
160 next unless urlstr =~ /^https?:/
162 debug "Getting title for #{urlstr}..."
164 title = get_title_for_url(urlstr,
165 :nick => m.source.nick,
166 :channel => m.channel,
167 :ircline => m.message)
168 debug "Title #{title ? '' : 'not '} found"
170 m.reply "Error #{e.message}"
# urls_displayed is set up outside the visible lines; presumably a running
# count of info replies already sent for this message — TODO confirm.
173 if display_info > urls_displayed
175 m.reply("#{LINK_INFO} #{title}", :overlong => :truncate)
182 # check to see if this url is already listed
183 next if list.find {|u| u.url == urlstr }
185 url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
186 debug "#{list.length} urls so far"
# NOTE(review): trimming only when length > max_urls looks off by one if the
# new url is inserted afterwards (the insertion is not visible here): the list
# could then hold max_urls + 1 entries — confirm whether >= was intended.
187 list.pop if list.length > @bot.config['url.max_urls']
188 debug "storing url #{url.url}"
190 debug "#{list.length} urls now"
# Write the list back under the message target.
192 @registry[m.target] = list
# Escape characters matching OUR_UNSAFE in the requested text, then pull the
# urls out of it.
# NOTE(review): URI.escape was removed in Ruby 3.0 — confirm the target Ruby
# version before relying on it.
196 escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
197 urls = URI.extract(escaped)
# Handle the urls in a separate thread. params[:urls].length is passed as
# display_info; presumably so every explicitly requested url gets an info
# reply — TODO confirm.
198 Thread.new { handle_urls(m, urls, params[:urls].length) }
# Only PrivMessage instances are scanned for urls.
202 return unless m.kind_of?(PrivMessage)
# Escape characters matching OUR_UNSAFE in the message text, then extract
# the urls from it.
# NOTE(review): URI.escape was removed in Ruby 3.0 — confirm the target Ruby
# version before relying on it.
205 escaped = URI.escape(m.message, OUR_UNSAFE)
206 urls = URI.extract(escaped)
# Handle the urls in a separate thread, using the default display_info.
207 Thread.new { handle_urls(m, urls) }
# Reply with one line per stored url, at most max lines.
#
# opts:: option hash. :channel is read below; callers in this file also pass
#        :msg, :list and :max, which presumably populate m, list and max in
#        lines not visible here — TODO confirm.
#
# Does nothing unless list, max and m are all set.
210 def reply_urls(opts={})
213 channel = opts[:channel]
215 return unless list and max and m
216 list[0..(max-1)].each do |url|
217 disp = "[#{url.time.strftime('%Y/%m/%d %H:%M:%S')}] <#{url.nick}> #{url.url}"
218 if @bot.config['url.info_on_list']
# Any error raised while fetching the info is swallowed by the inline rescue,
# which makes the call evaluate to nil.
220 get_title_for_url(url.url,
221 :nick => url.nick, :channel => channel) rescue nil
222 # If the url info was missing and we now have some, try to upgrade it
223 if channel and title and not url.info
224 ll = @registry[channel]
226 if el = ll.find { |u| u.url == url.url }
228 @registry[channel] = ll
231 disp << " --> #{title}" if title
233 m.reply disp, :overlong => :truncate
# Use the explicitly given channel, falling back to the message target.
238 channel = params[:channel] ? params[:channel] : m.target
239 max = params[:limit].to_i
# Stored urls for that channel.
242 list = @registry[channel]
244 m.reply "no urls seen yet for channel #{channel}"
246 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Search the urls stored for a channel and reply with the matches.
#
# m::      the requesting message
# params:: route parameters: :channel (optional), :limit and :string
250 def search(m, params)
# Use the explicitly given channel, falling back to the message target.
251 channel = params[:channel] ? params[:channel] : m.target
252 max = params[:limit].to_i
253 string = params[:string]
# The search string is compiled as a case-insensitive regexp.
# NOTE(review): an invalid pattern makes Regexp.new raise RegexpError; no
# handling is visible here — confirm it is rescued somewhere.
256 regex = Regexp.new(string, Regexp::IGNORECASE)
# A url matches on its address, the posting nick or, when url.info_on_list
# is set, its stored info.
257 list = @registry[channel].find_all {|url|
258 regex.match(url.url) || regex.match(url.nick) ||
259 (@bot.config['url.info_on_list'] && regex.match(url.info))
262 m.reply "no matches for channel #{channel}"
264 reply_urls :msg => m, :channel => channel, :list => list, :max => max
# Instantiate the plugin and register its command routes.
269 plugin = UrlPlugin.new
# "urls info ..." and "url info ..." both go to the info action; the rest of
# the line is captured as :urls.
270 plugin.map 'urls info *urls', :action => 'info'
271 plugin.map 'url info *urls', :action => 'info'
# Search routes, with and without an explicit channel. :limit defaults to 4
# and must be all digits.
# NOTE(review): each of the following maps ends with a trailing comma; the
# continuation lines carrying further options are not visible here.
272 plugin.map 'urls search :channel :limit :string', :action => 'search',
273 :defaults => {:limit => 4},
274 :requirements => {:limit => /^\d+$/},
276 plugin.map 'urls search :limit :string', :action => 'search',
277 :defaults => {:limit => 4},
278 :requirements => {:limit => /^\d+$/},
# Listing routes, with and without an explicit channel; same :limit rules.
280 plugin.map 'urls :channel :limit', :defaults => {:limit => 4},
281 :requirements => {:limit => /^\d+$/},
283 plugin.map 'urls :limit', :defaults => {:limit => 4},
284 :requirements => {:limit => /^\d+$/},