X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=data%2Frbot%2Fplugins%2Fsearch.rb;h=3b7d682684f7a6f3be3dfcd9a3e6f55858c4ded0;hb=edd1cf77be07ae507014574141e920ad23eb164d;hp=27f9519c37b4d5e3bec5bbda080380526371bed9;hpb=6fe052181de09dd4a618241e600e2473f6706343;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git diff --git a/data/rbot/plugins/search.rb b/data/rbot/plugins/search.rb index 27f9519c..3b7d6826 100644 --- a/data/rbot/plugins/search.rb +++ b/data/rbot/plugins/search.rb @@ -1,42 +1,24 @@ +#-- vim:sw=2:et +#++ +# +# :title: Google and Wikipedia search plugin for rbot +# +# Author:: Tom Gilbert (giblet) +# Author:: Giuseppe "Oblomov" Bilotta +# +# Copyright:: (C) 2002-2005 Tom Gilbert +# Copyright:: (C) 2006 Tom Gilbert, Giuseppe Bilotta +# Copyright:: (C) 2006-2007 Giuseppe Bilotta + +# TODO:: use lr=lang_ or whatever is most appropriate to let google know +# it shouldn't use the bot's location to find the preferred language + require 'uri' Net::HTTP.version_1_2 GOOGLE_WAP_LINK = /(.*?)<\/a>/im -class ::String - def ircify_html - txt = self - - # bold and strong -> bold - txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}") - - # italic, emphasis and underline -> underline - txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}") - - ## This would be a nice addition, but the results are horrible - ## Maybe make it configurable? - # txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}") - - # Paragraph and br tags are converted to whitespace. - txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ') - txt.gsub!("\n", ' ') - - # All other tags are just removed - txt.gsub!(/<[^>]+>/, '') - - # Remove double formatting options, since they only waste bytes - txt.gsub!(/#{Bold}\s*#{Bold}/,"") - txt.gsub!(/#{Underline}\s*#{Underline}/,"") - - # And finally whitespace is squeezed - txt.gsub!(/\s+/, ' ') - - # Decode entities and strip whitespace - return Utils.decode_html_entities(txt).strip! - end -end - class SearchPlugin < Plugin BotConfig.register BotConfigIntegerValue.new('google.hits', :default => 3, @@ -103,51 +85,10 @@ class SearchPlugin < Plugin first_pars = params[:firstpar] || @bot.config['google.first_par'] - idx = 0 - while first_pars > 0 and urls.length > 0 - url.replace(urls.shift) - idx += 1 - - # FIXME what happens if some big file is returned? We should share - # code with the url plugin to only retrieve partial file content! - xml = @bot.httputil.get_cached(url) - if xml.nil? - debug "Unable to retrieve #{url}" - next - end - # We get the first par after the first main heading, if possible - header_found = xml.match(/]*)?>(.*?)<\/h1>/im) - txt = String.new - if header_found - debug "Found header: #{header_found[1].inspect}" - while txt.empty? - header_found = $' - candidate = header_found[/]*)?>.*?<\/p>/im] - break unless candidate - txt.replace candidate.ircify_html - end - end - # If we haven't found a first par yet, try to get it from the whole - # document - if txt.empty? - header_found = xml - while txt.empty? - candidate = header_found[/]*)?>.*?<\/p>/im] - break unless candidate - txt.replace candidate.ircify_html - header_found = $' - end - end - # Nothing yet, try title - if txt.empty? - debug "No first par found\n#{xml}" - # FIXME only do this if the 'url' plugin is loaded - txt.replace @bot.plugins['url'].get_title_from_html(xml) - next if txt.empty? - end - m.reply "[#{idx}] #{txt}", :overlong => :truncate - first_pars -=1 - end + return unless first_pars > 0 + + Utils.get_first_pars urls, first_pars, :http_util => @bot.httputil, :message => m + end def wikipedia(m, params)