X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=data%2Frbot%2Fplugins%2Furl.rb;h=d08c24e837fea4fe2bb88b6829a7ba5e322fef8a;hb=1e841175468b3e0357ab278a226a237fe4d7687e;hp=296ece33bf71f799ac802b243e62e9f845187cce;hpb=87e7db2b40eb31b33f27e7c3bb7eeb256724718a;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git

diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index 296ece33..d08c24e8 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -5,11 +5,7 @@
 
 define_structure :Url, :channel, :nick, :time, :url, :info
 
-class ::UrlLinkError < RuntimeError
-end
-
 class UrlPlugin < Plugin
-  TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
   LINK_INFO = "[Link Info]"
   OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
 
@@ -53,8 +49,7 @@ class UrlPlugin < Plugin
   end
 
   def get_title_from_html(pagedata)
-    return unless TITLE_RE.match(pagedata)
-    $1.ircify_html
+    return pagedata.ircify_html_title
   end
 
   def get_title_for_url(uri_str, opts = {})
@@ -62,82 +57,52 @@ class UrlPlugin < Plugin
     url = uri_str.kind_of?(URI) ? uri_str : URI.parse(uri_str)
     return if url.scheme !~ /https?/
 
-    if url.host =~ @no_info_hosts
-      return "Sorry, info retrieval for #{url.host} is disabled"
+    # also check the ip, the canonical name and the aliases
+    begin
+      checks = TCPSocket.gethostbyname(url.host)
+      checks.delete_at(-2)
+    rescue => e
+      return "Unable to retrieve info for #{url.host}: #{e.message}"
+    end
+
+    checks << url.host
+    checks.flatten!
+
+    unless checks.grep(@no_info_hosts).empty?
+      return "Sorry, info retrieval for #{url.host} (#{checks.first}) is disabled"
     end
 
     logopts = opts.dup
 
     title = nil
-    extra = String.new
+    extra = []
 
     begin
-      debug "+ getting #{url.request_uri}"
-      @bot.httputil.get_response(url) { |resp|
-        case resp
-        when Net::HTTPSuccess
-
-          debug resp.to_hash
-
-          if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
-            # The page is text or HTML, so we can try finding a title and, if
-            # requested, the first par.
-            #
-            # We act differently depending on whether we want the first par or
-            # not: in the first case we download the initial part and the parse
-            # it; in the second case we only download as much as we need to find
-            # the title
-            #
-            if @bot.config['url.first_par']
-              partial = resp.partial_body(@bot.config['http.info_bytes'])
-              logopts[:title] = title = get_title_from_html(partial)
-              if url.fragment and not url.fragment.empty?
-                fragreg = /.*?<a\s+[^>]*name=["']?#{url.fragment}["']?.*?>/im
-                partial.sub!(fragreg,'')
-              end
-              first_par = Utils.ircify_first_html_par(partial, :strip => title)
-              unless first_par.empty?
-                logopts[:extra] = first_par
-                extra << ", #{Bold}text#{Bold}: #{first_par}"
-              end
-              call_event(:url_added, url.to_s, logopts)
-              return "#{Bold}title#{Bold}: #{title}#{extra}" if title
-            else
-              resp.partial_body(@bot.config['http.info_bytes']) { |part|
-                logopts[:title] = title = get_title_from_html(part)
-                call_event(:url_added, url.to_s, logopts)
-                return "#{Bold}title#{Bold}: #{title}" if title
-              }
-            end
-            # if nothing was found, provide more basic info, as for non-html pages
-          else
-            resp.no_cache = true
-          end
+      debug "+ getting info for #{url.request_uri}"
+      info = @bot.filter(:htmlinfo, url)
+      debug info
+      resp = info[:headers]
 
-          enc = resp['content-encoding']
-          logopts[:extra] = String.new
-          logopts[:extra] << "Content Type: #{resp['content-type']}"
-          if enc
-            logopts[:extra] << ", encoding: #{enc}"
-            extra << ", #{Bold}encoding#{Bold}: #{enc}"
-          end
+      logopts[:title] = title = info[:title]
 
-          unless @bot.config['url.titles_only']
-            # content doesn't have title, just display info.
-            size = resp['content-length'].gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
-            if size
-              logopts[:extra] << ", size: #{size} bytes"
-              size = ", #{Bold}size#{Bold}: #{size} bytes"
-            end
-            call_event(:url_added, url.to_s, logopts)
-            return "#{Bold}type#{Bold}: #{resp['content-type']}#{size}#{extra}"
-          end
-          call_event(:url_added, url.to_s, logopts)
-        else
-          raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
+      if info[:content]
+        logopts[:extra] = info[:content]
+        extra << "#{Bold}text#{Bold}: #{info[:content]}" if @bot.config['url.first_par']
+      else
+        logopts[:extra] = String.new
+        logopts[:extra] << "Content Type: #{resp['content-type']}"
+        extra << "#{Bold}type#{Bold}: #{resp['content-type']}" unless title
+        if enc = resp['content-encoding']
+          logopts[:extra] << ", encoding: #{enc}"
+          extra << "#{Bold}encoding#{Bold}: #{enc}" if @bot.config['url.first_par'] or not title
         end
-      }
-      return nil
+
+        size = resp['content-length'].first.gsub(/(\d)(?=\d{3}+(?:\.|$))(\d{3}\..*)?/,'\1,\2') rescue nil
+        if size
+          logopts[:extra] << ", size: #{size} bytes"
+          extra << "#{Bold}size#{Bold}: #{size} bytes" if @bot.config['url.first_par'] or not title
+        end
+      end
     rescue Exception => e
       case e
       when UrlLinkError
@@ -147,40 +112,50 @@ class UrlPlugin < Plugin
         raise "connecting to site/processing information (#{e.message})"
       end
     end
+
+    call_event(:url_added, url.to_s, logopts)
+    if title
+      extra.unshift("#{Bold}title#{Bold}: #{title}")
+    end
+    return extra.join(", ") if title or not @bot.config['url.titles_only']
   end
 
   def handle_urls(m, urls, display_info=@bot.config['url.display_link_info'])
     return if urls.empty?
 
     debug "found urls #{urls.inspect}"
-    if m.public?
-      list = @registry[m.target]
-    else
-      list = nil
-    end
+    list = m.public? ? @registry[m.target] : nil
+    debug "display link info: #{display_info}"
     urls_displayed = 0
-    urls.each { |urlstr|
+    urls.each do |urlstr|
       debug "working on #{urlstr}"
       next unless urlstr =~ /^https?:/
       title = nil
-      debug "display link info: #{display_info}"
+      debug "Getting title for #{urlstr}..."
+      reply = nil
+      begin
+        title = get_title_for_url(urlstr,
+                                  :nick => m.source.nick,
+                                  :channel => m.channel,
+                                  :ircline => m.message)
+        debug "Title #{title ? '' : 'not '} found"
+        reply = "#{LINK_INFO} #{title}" if title
+      rescue => e
+        debug e
+        # we might get a 404 because of trailing punctuation, so we try again
+        # with the last character stripped. this might generate invalid URIs
+        # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
+        if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
+          # chop off last character, and retry if we still have enough string to
+          # look like a minimal URL
+          retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
+        end
+        reply = "Error #{e.message}"
+      end
+
       if display_info > urls_displayed
-        urls_displayed += 1
-        Thread.start do
-          debug "Getting title for #{urlstr}..."
-          begin
-            title = get_title_for_url(urlstr,
-                                      :nick => m.source.nick,
-                                      :channel => m.channel,
-                                      :ircline => m.message)
-            if title
-              m.reply "#{LINK_INFO} #{title}", :overlong => :truncate
-              debug "Title found!"
-            else
-              debug "Title not found!"
-            end
-          rescue => e
-            m.reply "Error #{e.message}"
-          end
+        if reply
+          m.plainreply(reply, :overlong => :truncate)
+          urls_displayed += 1
         end
       end
@@ -191,20 +166,18 @@ class UrlPlugin < Plugin
 
       url = Url.new(m.target, m.sourcenick, Time.new, urlstr, title)
       debug "#{list.length} urls so far"
-      if list.length > @bot.config['url.max_urls']
-        list.pop
-      end
+      list.pop if list.length > @bot.config['url.max_urls']
       debug "storing url #{url.url}"
       list.unshift url
       debug "#{list.length} urls now"
-    }
+    end
     @registry[m.target] = list
   end
 
   def info(m, params)
     escaped = URI.escape(params[:urls].to_s, OUR_UNSAFE)
     urls = URI.extract(escaped)
-    handle_urls(m, urls, params[:urls].length)
+    Thread.new { handle_urls(m, urls, params[:urls].length) }
   end
 
   def listen(m)
@@ -212,8 +185,9 @@
     return if m.address?
 
     escaped = URI.escape(m.message, OUR_UNSAFE)
-    urls = URI.extract(escaped)
-    handle_urls(m, urls)
+    urls = URI.extract(escaped, ['http', 'https'])
+    return if urls.empty?
+    Thread.new { handle_urls(m, urls) }
   end
 
   def reply_urls(opts={})
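The blacklist check added above relies on TCPSocket.gethostbyname, which returns an array of the form [canonical_name, aliases, address_family, address, ...]: the patch drops the address-family entry with delete_at(-2), appends the host as given, flattens the alias sub-array and greps the whole list against the url.no_info_hosts pattern, so a blacklisted site is also caught when it is reached through an alias or its IP. A rough standalone sketch of the same idea, assuming a single resolved address as the patch does (the helper name and the example blacklist are illustrative, not part of the plugin):

  require 'socket'

  # Hypothetical helper mirroring the patch's check: gather the canonical
  # name, aliases and resolved address of a host and match them all (plus
  # the host as given) against a blacklist regexp.
  def info_disabled_for?(host, no_info_hosts)
    checks = TCPSocket.gethostbyname(host)  # [cname, [aliases], address_family, addr, ...]
    checks.delete_at(-2)                    # drop the address-family entry (assumes one address)
    checks << host
    checks.flatten!
    not checks.grep(no_info_hosts).empty?
  rescue SocketError, SystemCallError
    false                                   # resolution failed; the plugin reports the error instead
  end

  # info_disabled_for?('localhost', /^(127\.|localhost$)/)  #=> true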