+
+ pre_h = pars = by_span = nil
+
+ while true
+ debug "Minimum number of spaces: #{min_spaces}"
+
+ # Initial attempt: <p> that follows <h\d>
+ if pre_h.nil?
+ pre_h = Hpricot::Elements[]
+ found_h = false
+ doc.search("*") { |e|
+ next if e.bogusetag?
+ case e.pathname
+ when /^h\d/
+ found_h = true
+ when 'p'
+ pre_h << e if found_h
+ end
+ }
+ debug "Hx: found: #{pre_h.pretty_inspect}"
+ end
+
+ pre_h.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Second natural attempt: just get any <p>
+ pars = doc/"p" if pars.nil?
+ debug "par: found: #{pars.pretty_inspect}"
+ pars.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Nothing yet ... let's get drastic: we look for non-par elements too,
+ # but only for those that match something that we know is likely to
+ # contain text
+
+ # Some blogging and forum platforms use spans or divs with a 'body' or
+ # 'message' or 'text' in their class or id to mark actual text. Since we want
+ # the class/id match to be partial and case insensitive, we collect
+ # the common elements that may have this class and then filter out those
+ # we don't need. If no divs or spans are found, we'll accept additional
+ # elements too (td, tr, tbody, table).
+ if by_span.nil?
+ by_span = Hpricot::Elements[]
+ extra = Hpricot::Elements[]
+ doc.search("*") { |el|
+ next if el.bogusetag?
+ case el.pathname
+ when AFTER_PAR_PATH
+ by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+ when AFTER_PAR_EX
+ extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+ end
+ }
+ if by_span.empty? and not extra.empty?
+ by_span.concat extra
+ end
+ debug "other \#1: found: #{by_span.pretty_inspect}"
+ end
+
+ by_span.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # As a last resort, we could try the text enclosed between two <br> tags
+ # TODO
+
+ debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
+ return txt unless txt.count(" ") < min_spaces
+ break if min_spaces == 0
+ min_spaces /= 2
+ end
+ end
+
+ # Grab the first HTML paragraph without using Hpricot
+ def Utils.ircify_first_html_par_woh(xml_org, opts={})
+ xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+