1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
|
require 'uri'
# NOTE(review): presumably selects the 1.2 API behaviour of Net::HTTP; net/http
# is not required in the visible part of this file -- confirm it is loaded
# elsewhere before this line runs.
Net::HTTP.version_1_2
# Matches one result link in the WML page returned by Google's /wml/search.
# Capture groups, as consumed by SearchPlugin#google:
#   1. the single-digit accesskey, shown as the hit number
#   2. the target address taken from the "u=" part of the href (still escaped)
#   3. the link text, shown as the hit title
GOOGLE_WAP_LINK = /<a accesskey="(\d)" href=".*?u=(.*?)">(.*?)<\/a>/im
# Helpers added to the core String class for formatting text that is sent
# to IRC.
class ::String
  # Shortens the string to at most +len+ characters.
  #
  # When the string is longer than +len+, it is cut at +len+ characters and
  # the trailing whitespace plus the (possibly partial) last word are replaced
  # by "...". If the kept part contains no whitespace at all it is returned
  # as cut, without an ellipsis. Strings that already fit are returned as-is.
  def omissis_after(len)
    return self if length <= len

    # \z (end of string) rather than $ (end of line): with $ a multi-line
    # string would be cut at the end of its first line only.
    self[0...len].sub(/\s+\S*\z/, "...")
  end

  # Converts an HTML fragment into a single line of IRC-formatted text.
  #
  # Bold/strong tags become the Bold code, italic/emphasis/underline tags the
  # Underline code (both constants are defined outside this file), paragraph
  # and line-break tags and newlines become spaces, every other tag is
  # dropped, and whitespace is squeezed.
  #
  # Returns a new String with HTML entities decoded by
  # Utils.decode_html_entities and surrounding whitespace removed; the
  # receiver is left untouched.
  def ircify_html
    # Work on a copy so the receiver is not modified (and may be frozen).
    txt = dup

    # bold and strong -> bold
    txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}")

    # italic, emphasis and underline -> underline
    txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}")

    ## This would be a nice addition, but the results are horrible
    ## Maybe make it configurable?
    # txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}")

    # Paragraph and br tags are converted to whitespace.
    txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ')
    txt.gsub!("\n", ' ')

    # All other tags are just removed
    txt.gsub!(/<[^>]+>/, '')

    # Remove double formatting options, since they only waste bytes
    txt.gsub!(/#{Bold}\s*#{Bold}/, "")
    txt.gsub!(/#{Underline}\s*#{Underline}/, "")

    # And finally whitespace is squeezed
    txt.gsub!(/\s+/, ' ')

    # Decode entities and strip whitespace. The non-bang strip is required
    # here: strip! returns nil when there is nothing to strip.
    Utils.decode_html_entities(txt).strip
  end
end
# Bot plugin offering Google searches ("search"/"google") and Wikipedia
# lookups ("wp"); the latter are implemented as site-restricted Google
# queries. Results are read from Google's WML search page.
class SearchPlugin < Plugin
  BotConfig.register BotConfigIntegerValue.new('google.hits',
    :default => 3,
    :desc => "Number of hits to return from Google searches")
  BotConfig.register BotConfigIntegerValue.new('google.first_par',
    :default => 0,
    :desc => "When set to n > 0, the bot will return the first paragraph from the first n search hits")
  BotConfig.register BotConfigIntegerValue.new('wikipedia.hits',
    :default => 3,
    :desc => "Number of hits to return from Wikipedia searches")
  BotConfig.register BotConfigIntegerValue.new('wikipedia.first_par',
    :default => 1,
    :desc => "When set to n > 0, the bot will return the first paragraph from the first n wikipedia search hits")

  # Returns the help text for +topic+ ("search", "google", "wp"), or a
  # general summary for any other topic. +plugin+ is not used.
  def help(plugin, topic="")
    case topic
    when "search", "google"
      "#{topic} <string> => search google for <string>"
    when "wp"
      "wp [<code>] <string> => search for <string> on Wikipedia. You can select a national <code> to only search the national Wikipedia"
    else
      "search <string> (or: google <string>) => search google for <string> | wp <string> => search for <string> on Wikipedia"
    end
  end

  # Searches Google for params[:words] and replies to +m+ with the hits.
  #
  # Recognised entries of +params+:
  # :words::    the search terms (converted with to_s)
  # :site::     optional site to restrict the search to
  # :filter::   optional pattern removed from every hit title
  # :hits::     number of hits to list (default: 'google.hits' setting)
  # :firstpar:: number of hits whose first paragraph is also reported
  #             (default: 'google.first_par' setting)
  def google(m, params)
    what = params[:words].to_s
    # NOTE(review): with its default pattern URI.escape leaves reserved
    # characters such as '&' and '+' unescaped, so they reach the query
    # string verbatim -- confirm whether that is intended.
    searchfor = URI.escape what

    # This method is also called by other methods to restrict searching to some sites
    site = params[:site] ? "site:#{params[:site]}+" : ""

    # It is also possible to choose a filter to remove constant parts from the titles
    # e.g.: "Wikipedia, the free encyclopedia" when doing Wikipedia searches
    filter = params[:filter] || ""

    url = "http://www.google.com/wml/search?q=#{site}#{searchfor}"
    hits = params[:hits] || @bot.config['google.hits']

    wml = begin
      @bot.httputil.get_cached(url)
    rescue => e
      debug "error retrieving #{url}: #{e.message}"
      nil
    end
    # A nil page (as opposed to an exception) is treated as a failure too,
    # instead of crashing on the scan below.
    if wml.nil?
      m.reply "error googling for #{what}"
      return
    end

    results = wml.scan(GOOGLE_WAP_LINK)
    if results.length == 0
      m.reply "no results found for #{what}"
      return
    end

    urls = []
    listing = results[0...hits].map do |accesskey, target, title|
      link = URI.unescape(target)
      urls << link
      label = Utils.decode_html_entities(title.gsub(filter, '').strip)
      "#{accesskey}. #{Bold}#{label}#{Bold}: #{link}"
    end.join(" | ")

    m.reply "Results for #{what}: #{listing}"

    remaining = params[:firstpar] || @bot.config['google.first_par']

    urls.each_with_index do |page_url, pos|
      break unless remaining > 0
      # Hit number as shown in the listing; it advances even for hits that
      # are skipped below.
      idx = pos + 1

      # FIXME what happens if some big file is returned? We should share
      # code with the url plugin to only retrieve partial file content!
      xml = @bot.httputil.get_cached(page_url)
      if xml.nil?
        debug "Unable to retrieve #{page_url}"
        next
      end

      txt = first_par_of(xml)

      # Nothing yet, try title
      if txt.empty?
        debug "No first par found\n#{xml}"
        # Only ask the 'url' plugin when it is actually available, and
        # tolerate a nil title.
        url_plugin = @bot.plugins['url']
        if url_plugin.respond_to?(:get_title_from_html)
          txt = url_plugin.get_title_from_html(xml).to_s
        end
        next if txt.empty?
      end

      m.reply "[#{idx}] #{txt}".omissis_after(400)
      remaining -= 1
    end
  end

  # Searches Wikipedia (optionally the edition selected by params[:lang])
  # by delegating to #google with a site restriction and a title filter.
  # Note that +params+ is modified in place.
  def wikipedia(m, params)
    lang = params[:lang]
    prefix = lang.nil? ? '' : lang + '.'
    site = "#{prefix}wikipedia.org"
    debug "Looking up things on #{site}"
    params[:site] = site
    params[:filter] = / - Wikipedia.*$/
    params[:hits] = @bot.config['wikipedia.hits']
    params[:firstpar] = @bot.config['wikipedia.first_par']
    return google(m, params)
  end

  private

  # Returns the IRC-formatted first non-empty paragraph of +html+, preferring
  # paragraphs that follow the first <h1> heading and falling back to the
  # whole document. Returns an empty string when none is found.
  def first_par_of(html)
    heading = html.match(/<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im)
    if heading
      debug "Found header: #{heading[1].inspect}"
      text = first_nonempty_par(heading.post_match)
      return text unless text.empty?
    end
    first_nonempty_par(html)
  end

  # Walks the <p> elements of +html+ in order and returns the first one whose
  # ircify_html text is not empty, or an empty string.
  def first_nonempty_par(html)
    rest = html
    while (par = rest.match(/<p(?:\s+[^>]*)?>.*?<\/p>/im))
      text = par[0].ircify_html
      return text unless text.empty?
      rest = par.post_match
    end
    ""
  end
end
# Instantiate the plugin and register its command templates.
plugin = SearchPlugin.new
# "search" and "google" are two names for the same action.
plugin.map "search *words", :action => 'google'
plugin.map "google *words", :action => 'google'
# "wp" with a leading two- or three-character word passes that word as :lang.
# NOTE(review): this template is registered before the plain "wp" one,
# presumably so it is tried first -- keep the order unless the matching
# rules of Plugin#map say otherwise.
plugin.map "wp :lang *words", :action => 'wikipedia', :requirements => { :lang => /^\w\w\w?$/ }
plugin.map "wp *words", :action => 'wikipedia'
|