+
# Raised when fetching or inspecting a link fails (non-success HTTP response).
# Defined at top level so it can be rescued outside the Utils namespace.
::UrlLinkError = Class.new(RuntimeError)
+
# Extract title, content (first paragraph) and extra information from the
# given Net::HTTPResponse _resp_.
#
# Accepted options (in _opts_):
# uri_fragment:: the URI fragment of the original request
# full_body:: read the whole body instead of just the first
#             @@bot.config['http.info_bytes'] bytes
#
# Returns a DataStream with the following keys:
# text:: the (partial) body
# title:: the title of the document (if any)
# content:: the first paragraph of the document (if any)
# headers::
#   the headers of the Net::HTTPResponse. The value is a Hash whose keys
#   are lowercase forms of the HTTP header fields, and whose values are
#   Arrays.
#
# Raises UrlLinkError for any non-success response.
#
def Utils.get_resp_html_info(resp, opts={})
  case resp
  when Net::HTTPSuccess
    # Recover the fragment from the (possibly redirected) location header,
    # unless the caller already supplied one.
    location = begin
      URI.parse(resp['x-rbot-location'] || resp['location'])
    rescue StandardError
      nil
    end
    if location && location.fragment && !location.fragment.empty?
      opts[:uri_fragment] ||= location.fragment
    end

    stream = DataStream.new(opts.dup)
    stream[:headers] = resp.to_hash
    body = opts[:full_body] ? resp.body : resp.partial_body(@@bot.config['http.info_bytes'])
    stream[:text] = body

    # Give the :htmlinfo filter group first shot at the data.
    filtered = Utils.try_htmlinfo_filters(stream)
    return filtered if filtered

    # Fall back to the generic extractor for text/*, XML and HTML bodies.
    if resp['content-type'] =~ /^text\/|(?:x|ht)ml/
      stream.merge!(Utils.get_string_html_info(body, opts))
    end
    return stream
  else
    raise UrlLinkError, "getting link (#{resp.code} - #{resp.message})"
  end
end
+
# Run an appropriately-crafted DataStream _ds_ through the filters in the
# :htmlinfo filter group, in order. As soon as one filter returns non-nil,
# its results are merged into _ds_ and the merge is returned; if no filter
# matches (or none are registered), nil is returned.
#
# The input DataStream should have the downloaded HTML as primary key
# (:text) and possibly a :headers key holding the response headers.
#
def Utils.try_htmlinfo_filters(ds)
  names = @@bot.filter_names(:htmlinfo)
  return nil if names.empty?
  result = nil
  # TODO filter priority
  names.each do |name|
    debug "testing htmlinfo filter #{name}"
    result = @@bot.filter(@@bot.global_filter_name(name, :htmlinfo), ds)
    debug "returned #{result.pretty_inspect}"
    break if result
  end
  result ? ds.merge(result) : nil
end
+
# HTML info filters often need to check whether the webpage location of a
# passed DataStream _ds_ matches a given Regexp _rx_.
#
# Checks both the 'x-rbot-location' and 'location' headers; returns the
# Array of matching values, or nil when nothing matches (or there are no
# headers at all).
def Utils.check_location(ds, rx)
  debug ds[:headers]
  headers = ds[:headers]
  matches = if headers
    [headers['x-rbot-location'], headers['location']].flatten.grep(rx)
  else
    []
  end
  debug matches
  matches.empty? ? nil : matches
end
+
# Extract title and content (first paragraph) from the given HTML or XML
# document _text_, using standard methods (String#ircify_html_title,
# Utils.ircify_first_html_par).
#
# Currently, the only accepted option (in _opts_) is
# uri_fragment:: the URI fragment of the original request; when present,
#                extraction starts after the anchor with that name/id
#
# Returns a Hash with :title and :content keys (either may be nil).
#
def Utils.get_string_html_info(text, opts={})
  debug "getting string html info"
  txt = text.dup
  title = txt.ircify_html_title
  debug opts
  frag = opts[:uri_fragment]
  if frag && !frag.empty?
    # Escape the fragment so metacharacters in it (e.g. '.', '(') are
    # matched literally instead of being interpreted as regex syntax.
    fragreg = /<a\s+(?:[^>]+\s+)?(?:name|id)=["']?#{Regexp.escape(frag)}["']?[^>]*>/im
    debug fragreg
    debug txt
    if m = txt.match(fragreg)
      # Grab everything after the anchor; prefer MatchData#post_match
      # over the fragile $' global.
      txt = m.post_match
    end
    debug txt
  end
  c_opts = opts.dup
  c_opts[:strip] ||= title
  content = Utils.ircify_first_html_par(txt, c_opts)
  # Guard against a nil return from the helper: calling .empty? on nil
  # would raise NoMethodError.
  content = nil if content && content.empty?
  return {:title => title, :content => content}
end
+
# Get the first paragraphs of the first _count_ _urls_.
# The pages are downloaded using the bot httputil service.
# Returns an array of the first paragraphs fetched (entries may be nil
# when a page yielded no content).
# If (optional) _opts_ :message is specified, those paragraphs are echoed
# as replies to the IRC message passed as _opts_ :message.
#
def Utils.get_first_pars(urls, count, opts={})
  msg = opts[:message]
  fetched = []
  index = 0
  remaining = count
  while remaining > 0 && !urls.empty?
    url = urls.shift
    index += 1

    begin
      info = Utils.get_html_info(URI.parse(url), opts)
      par = info[:content]
      fetched << par

      if par
        msg.reply("[#{index}] #{par}", :overlong => :truncate) if msg
        remaining -= 1
      end
    rescue StandardError
      # Best effort: skip pages that fail to download or parse.
      debug "Unable to retrieve #{url}: #{$!}"
      next
    end
  end
  return fetched
end
+
# Join _words_ into a comma separated list, except for the last element,
# which is joined in with the specified conjunction.
#
# Options:
# :join_with::      separator for all but the last element (default ", ")
# :join_last_with:: separator before the last element (default _(" and "))
#
def Utils.comma_list(words, options={})
  opts = { :join_with => ", ", :join_last_with => _(" and ") }.merge(options)

  case words.size
  when 0, 1
    words.last
  else
    head = words[0..-2].join(opts[:join_with])
    [head, words.last].join(opts[:join_last_with])
  end
end
+