X-Git-Url: https://git.netwichtig.de/gitweb/?a=blobdiff_plain;f=data%2Frbot%2Fplugins%2Fmarkov.rb;h=abb10172d639db03f91e98386285c40bfdb6b5ef;hb=3d8bdf551aebdd4fa7ddb10fa8e824232dd4f82b;hp=22981f683a6b4f2bb62c3247fb7f092dd5facbb8;hpb=6f1d91a2ea5e86732f878553e474c5fd77012b17;p=user%2Fhenk%2Fcode%2Fruby%2Frbot.git diff --git a/data/rbot/plugins/markov.rb b/data/rbot/plugins/markov.rb old mode 100755 new mode 100644 index 22981f68..abb10172 --- a/data/rbot/plugins/markov.rb +++ b/data/rbot/plugins/markov.rb @@ -20,10 +20,194 @@ class MarkovPlugin < Plugin Config.register Config::ArrayValue.new('markov.ignore', :default => [], :desc => "Hostmasks and channel names markov should NOT learn from (e.g. idiot*!*@*, #privchan).") + Config.register Config::ArrayValue.new('markov.readonly', + :default => [], + :desc => "Hostmasks and channel names markov should NOT talk to (e.g. idiot*!*@*, #privchan).") Config.register Config::IntegerValue.new('markov.max_words', :default => 50, :validate => Proc.new { |v| (0..100).include? v }, :desc => "Maximum number of words the bot should put in a sentence") + Config.register Config::FloatValue.new('markov.learn_delay', + :default => 0.5, + :validate => Proc.new { |v| v >= 0 }, + :desc => "Time the learning thread spends sleeping after learning a line. If set to zero, learning from files can be very CPU intensive, but also faster.") + Config.register Config::IntegerValue.new('markov.delay', + :default => 5, + :validate => Proc.new { |v| v >= 0 }, + :desc => "Wait short time before contributing to conversation.") + Config.register Config::IntegerValue.new('markov.answer_addressed', + :default => 50, + :validate => Proc.new { |v| (0..100).include? v }, + :desc => "Probability of answer when addressed by nick") + Config.register Config::ArrayValue.new('markov.ignore_patterns', + :default => [], + :desc => "Ignore these word patterns") + + MARKER = :"\r\n" + + # upgrade a registry entry from 0.9.14 and earlier, converting the Arrays + # into Hashes of weights + def upgrade_entry(k, logfile) + logfile.puts "\t#{k.inspect}" + logfile.flush + logfile.fsync + + ar = @registry[k] + + # wipe the current key + @registry.delete(k) + + # discard empty keys + if ar.empty? + logfile.puts "\tEMPTY" + return + end + + # otherwise, proceed + logfile.puts "\t#{ar.inspect}" + + # re-encode key to UTF-8 and cleanup as needed + words = k.split.map do |w| + BasicUserMessage.strip_formatting( + @bot.socket.filter.in(w) + ).sub(/\001$/,'') + end + + # old import that failed to split properly? + if words.length == 1 and words.first.include? '/' + # split at the last / + unsplit = words.first + at = unsplit.rindex('/') + words = [unsplit[0,at], unsplit[at+1..-1]] + end + + # if any of the re-split/re-encoded words have spaces, + # or are empty, we would get a chain we can't convert, + # so drop it + if words.first.empty? or words.first.include?(' ') or + words.last.empty? or words.last.include?(' ') + logfile.puts "\tSKIPPED" + return + end + + # former unclean CTCP, we can't convert this + if words.first[0] == 1 + logfile.puts "\tSKIPPED" + return + end + + # nonword CTCP => SKIP + # someword CTCP => nonword someword + if words.last[0] == 1 + if words.first == "nonword" + logfile.puts "\tSKIPPED" + return + end + words.unshift MARKER + words.pop + end + + # intern the old keys + words.map! do |w| + ['nonword', MARKER].include?(w) ? MARKER : w.chomp("\001") + end + + newkey = words.join(' ') + logfile.puts "\t#{newkey.inspect}" + + # the new key exists already, so we want to merge + if k != newkey and @registry.key? newkey + ar2 = @registry[newkey] + logfile.puts "\tMERGE" + logfile.puts "\t\t#{ar2.inspect}" + ar.push(*ar2) + # and get rid of the key + @registry.delete(newkey) + end + + total = 0 + hash = Hash.new(0) + + @chains_mutex.synchronize do + if @chains.key? newkey + ar2 = @chains[newkey] + total += ar2.first + hash.update ar2.last + end + + ar.each do |word| + case word + when :nonword + # former marker + sym = MARKER + else + # we convert old words into UTF-8, cleanup, resplit if needed, + # and only get the first word. we may lose some data for old + # missplits, but this is the best we can do + w = BasicUserMessage.strip_formatting( + @bot.socket.filter.in(word).split.first + ) + case w + when /^\001\S+$/, "\001", "" + # former unclean CTCP or end of CTCP + next + else + # intern after clearing leftover end-of-actions if present + sym = w.chomp("\001") + end + end + hash[sym] += 1 + total += 1 + end + if hash.empty? + logfile.puts "\tSKIPPED" + return + end + logfile.puts "\t#{[total, hash].inspect}" + @chains[newkey] = [total, hash] + end + end + + def upgrade_registry + # we load all the keys and then iterate over this array because + # running each() on the registry and updating it at the same time + # doesn't work + keys = @registry.keys + # no registry, nothing to do + return if keys.empty? + + ki = 0 + log "starting markov database conversion thread (v1 to v2, #{keys.length} keys)" + + keys.each { |k| @upgrade_queue.push k } + @upgrade_queue.push nil + + @upgrade_thread = Thread.new do + @registry.recovery = Proc.new { |val| + return [val] + } + logfile = File.open(@bot.path('markov-conversion.log'), 'a') + logfile.puts "=== conversion thread started #{Time.now} ===" + while k = @upgrade_queue.pop + ki += 1 + logfile.puts "Key #{ki} (#{@upgrade_queue.length} in queue):" + begin + upgrade_entry(k, logfile) + rescue Exception => e + logfile.puts "=== ERROR ===" + logfile.puts e.pretty_inspect + logfile.puts "=== EREND ===" + end + sleep @bot.config['markov.learn_delay'] unless @bot.config['markov.learn_delay'].zero? + end + logfile.puts "=== conversion thread stopped #{Time.now} ===" + logfile.close + @registry.recovery = nil + end + @upgrade_thread.priority = -1 + end + + attr_accessor :chains def initialize super @@ -41,82 +225,162 @@ class MarkovPlugin < Plugin @bot.config['markov.ignore'] = @bot.config['markov.ignore_users'].dup @bot.config.delete('markov.ignore_users'.to_sym) end + + @chains = @registry.sub_registry('v2') + @chains.set_default([]) + @rchains = @registry.sub_registry('v2r') + @rchains.set_default([]) + @chains_mutex = Mutex.new + @rchains_mutex = Mutex.new + + @upgrade_queue = Queue.new + @upgrade_thread = nil + upgrade_registry + @learning_queue = Queue.new @learning_thread = Thread.new do while s = @learning_queue.pop learn_line s - sleep 0.5 + sleep @bot.config['markov.learn_delay'] unless @bot.config['markov.learn_delay'].zero? end end @learning_thread.priority = -1 end def cleanup + if @upgrade_thread and @upgrade_thread.alive? + debug 'closing conversion thread' + @upgrade_queue.clear + @upgrade_queue.push nil + @upgrade_thread.join + debug 'conversion thread closed' + end + debug 'closing learning thread' + @learning_queue.clear @learning_queue.push nil @learning_thread.join debug 'learning thread closed' + @chains.close + @rchains.close + super + end + + # pick a word from the registry using the pair as key. + def pick_word(word1, word2=MARKER, chainz=@chains) + k = "#{word1} #{word2}" + return MARKER unless chainz.key? k + wordlist = chainz[k] + pick_word_from_list wordlist + end + + # pick a word from weighted hash + def pick_word_from_list(wordlist) + total = wordlist.first + hash = wordlist.last + return MARKER if total == 0 + return hash.keys.first if hash.length == 1 + hit = rand(total) + ret = MARKER + hash.each do |k, w| + hit -= w + if hit < 0 + ret = k + break + end + end + return ret end def generate_string(word1, word2) # limit to max of markov.max_words words if word2 - output = "#{word1} #{word2}" + output = [word1, word2] else - output = word1.to_s - end - - if @registry.key? output - wordlist = @registry[output] - wordlist.delete(:nonword) - else - output.downcase! + output = word1 keys = [] - @registry.each_key(output) do |key| + @chains.each_key(output) do |key| if key.downcase.include? output keys << key else break end end - if keys.empty? - keys = @registry.keys.select { |k| k.downcase.include? output } - end return nil if keys.empty? - while key = keys.delete_one - wordlist = @registry[key] - wordlist.delete(:nonword) - unless wordlist.empty? - output = key - word1, word2 = output.split - break - end + output = keys[rand(keys.size)].split(/ /) + end + output = output.split(/ /) unless output.is_a? Array + input = [word1, word2] + while output.length < @bot.config['markov.max_words'] and (output.first != MARKER or output.last != MARKER) do + if output.last != MARKER + output << pick_word(output[-2], output[-1]) + end + if output.first != MARKER + output.insert 0, pick_word(output[0], output[1], @rchains) end end - return nil if wordlist.empty? - - word3 = wordlist.pick_one - output << " #{word3}" - word1, word2 = word2, word3 - - (@bot.config['markov.max_words'] - 1).times do - wordlist = @registry["#{word1} #{word2}"] - break if wordlist.empty? - word3 = wordlist.pick_one - break if word3 == :nonword - output << " #{word3}" - word1, word2 = word2, word3 + output.delete MARKER + if output == input + nil + else + output.join(" ") end - return output end def help(plugin, topic="") - _("markov plugin: listens to chat to build a markov chain, with which it can (perhaps) attempt to (inanely) contribute to 'discussion'. Sort of.. Will get a *lot* better after listening to a lot of chat. usage: 'markov' to attempt to say something relevant to the last line of chat, if it can. other options to markov: 'ignore' => ignore a hostmask (accept no input), 'status' => show current status, 'probability []' => set the % chance of rbot responding to input, or display the current probability, 'chat' => try and say something intelligent, 'chat about ' => riff on a word pair (if possible)") + topic, subtopic = topic.split + + case topic + when "delay" + "markov delay => Set message delay" + when "ignore" + case subtopic + when "add" + "markov ignore add => ignore a hostmask or a channel" + when "list" + "markov ignore list => show ignored hostmasks and channels" + when "remove" + "markov ignore remove => unignore a hostmask or channel" + else + "ignore hostmasks or channels -- topics: add, remove, list" + end + when "readonly" + case subtopic + when "add" + "markov readonly add => read-only a hostmask or a channel" + when "list" + "markov readonly list => show read-only hostmasks and channels" + when "remove" + "markov readonly remove => unreadonly a hostmask or channel" + else + "restrict hostmasks or channels to read only -- topics: add, remove, list" + end + when "status" + "markov status => show if markov is enabled, probability and amount of messages in queue for learning" + when "probability" + "markov probability [] => set the % chance of rbot responding to input, or display the current probability" + when "chat" + case subtopic + when "about" + "markov chat about [] => talk about or riff on a word pair (if possible)" + else + "markov chat => try to say something intelligent" + end + when "learn" + ["markov learn from [testing [ lines]] [using pattern ]:", + "learn from the text in the specified , optionally using the given to filter the text.", + "you can sample what would be learned by specifying 'testing lines'"].join(' ') + else + "markov plugin: listens to chat to build a markov chain, with which it can (perhaps) attempt to (inanely) contribute to 'discussion'. Sort of.. Will get a *lot* better after listening to a lot of chat. Usage: 'chat' to attempt to say something relevant to the last line of chat, if it can -- help topics: ignore, readonly, delay, status, probability, chat, chat about, learn" + end end - def clean_str(s) - str = s.dup - str.gsub!(/^\S+[:,;]/, "") + def clean_message(m) + str = m.plainmessage.dup + str =~ /^(\S+)([:,;])/ + if $1 and m.target.is_a? Irc::Channel and m.target.user_nicks.include? $1.downcase + str.gsub!(/^(\S+)([:,;])\s+/, "") + end str.gsub!(/\s{2,}/, ' ') # fix for two or more spaces return str.strip end @@ -127,9 +391,11 @@ class MarkovPlugin < Plugin def status(m,params) if @bot.config['markov.enabled'] - reply = _("markov is currently enabled, %{p}% chance of chipping in") % { :p => probability? } + reply = _("markov is currently enabled, %{p}%% chance of chipping in") % { :p => probability? } l = @learning_queue.length reply << (_(", %{l} messages in queue") % {:l => l}) if l > 0 + l = @upgrade_queue.length + reply << (_(", %{l} chains to upgrade") % {:l => l}) if l > 0 else reply = _("markov is currently disabled") end @@ -138,7 +404,8 @@ class MarkovPlugin < Plugin def ignore?(m=nil) return false unless m - return true if m.address? or m.private? + return true if m.private? + return true if m.prefixed? @bot.config['markov.ignore'].each do |mask| return true if m.channel.downcase == mask.downcase return true if m.source.matches?(mask) @@ -146,11 +413,20 @@ class MarkovPlugin < Plugin return false end + def readonly?(m=nil) + return false unless m + @bot.config['markov.readonly'].each do |mask| + return true if m.channel.downcase == mask.downcase + return true if m.source.matches?(mask) + end + return false + end + def ignore(m, params) action = params[:action] user = params[:option] case action - when 'remove': + when 'remove' if @bot.config['markov.ignore'].include? user s = @bot.config['markov.ignore'] s.delete user @@ -159,7 +435,7 @@ class MarkovPlugin < Plugin else m.reply _("not found in list") end - when 'add': + when 'add' if user if @bot.config['markov.ignore'].include?(user) m.reply _("%{u} already in list") % { :u => user } @@ -170,13 +446,44 @@ class MarkovPlugin < Plugin else m.reply _("give the name of a person or channel to ignore") end - when 'list': + when 'list' m.reply _("I'm ignoring %{ignored}") % { :ignored => @bot.config['markov.ignore'].join(", ") } else m.reply _("have markov ignore the input from a hostmask or a channel. usage: markov ignore add ; markov ignore remove ; markov ignore list") end end + def readonly(m, params) + action = params[:action] + user = params[:option] + case action + when 'remove' + if @bot.config['markov.readonly'].include? user + s = @bot.config['markov.readonly'] + s.delete user + @bot.config['markov.readonly'] = s + m.reply _("%{u} removed") % { :u => user } + else + m.reply _("not found in list") + end + when 'add' + if user + if @bot.config['markov.readonly'].include?(user) + m.reply _("%{u} already in list") % { :u => user } + else + @bot.config['markov.readonly'] = @bot.config['markov.readonly'].push user + m.reply _("%{u} added to markov readonly list") % { :u => user } + end + else + m.reply _("give the name of a person or channel to read only") + end + when 'list' + m.reply _("I'm only reading %{readonly}") % { :readonly => @bot.config['markov.readonly'].join(", ") } + else + m.reply _("have markov not answer to input from a hostmask or a channel. usage: markov readonly add ; markov readonly remove ; markov readonly list") + end + end + def enable(m, params) @bot.config['markov.enabled'] = true m.okay @@ -187,7 +494,7 @@ class MarkovPlugin < Plugin @bot.config['markov.probability'] = params[:probability].to_i m.okay else - m.reply _("markov has a %{prob}% chance of chipping in") % { :prob => probability? } + m.reply _("markov has a %{prob}%% chance of chipping in") % { :prob => probability? } end end @@ -196,30 +503,74 @@ class MarkovPlugin < Plugin m.okay end - def should_talk + def should_talk(m) return false unless @bot.config['markov.enabled'] - prob = probability? + prob = m.address? ? @bot.config['markov.answer_addressed'] : probability? return true if prob > rand(100) return false end - def delay - 1 + rand(5) + # Generates all sequence pairs from array + # seq_pairs [1,2,3,4] == [ [1,2], [2,3], [3,4]] + def seq_pairs(arr) + res = [] + 0.upto(arr.size-2) do |i| + res << [arr[i], arr[i+1]] + end + res end - def random_markov(m, message) - return unless should_talk - - word1, word2 = message.split(/\s+/) - return unless word1 and word2 - line = generate_string(word1, word2) - return unless line - # we do nothing if the line we return is just an initial substring - # of the line we received - return if message.index(line) == 0 - @bot.timer.add_once(delay) { + def set_delay(m, params) + if params[:delay] == "off" + @bot.config["markov.delay"] = 0 + m.okay + elsif !params[:delay] + m.reply _("Message delay is %{delay}" % { :delay => @bot.config["markov.delay"]}) + else + @bot.config["markov.delay"] = params[:delay].to_i + m.okay + end + end + + def reply_delay(m, line) + m.replied = true + if @bot.config['markov.delay'] > 0 + @bot.timer.add_once(1 + rand(@bot.config['markov.delay'])) { + m.reply line, :nick => false, :to => :public + } + else m.reply line, :nick => false, :to => :public - } + end + end + + def random_markov(m, message) + return unless should_talk(m) + + words = clean_message(m).split(/\s+/) + if words.length < 2 + line = generate_string words.first, nil + + if line and message.index(line) != 0 + reply_delay m, line + return + end + else + pairs = seq_pairs(words).sort_by { rand } + pairs.each do |word1, word2| + line = generate_string(word1, word2) + if line and message.index(line) != 0 + reply_delay m, line + return + end + end + words.sort_by { rand }.each do |word| + line = generate_string word.first, nil + if line and message.index(line) != 0 + reply_delay m, line + return + end + end + end end def chat(m, params) @@ -233,13 +584,11 @@ class MarkovPlugin < Plugin def rand_chat(m, params) # pick a random pair from the db and go from there - word1, word2 = :nonword, :nonword + word1, word2 = MARKER, MARKER output = Array.new - 50.times do - wordlist = @registry["#{word1} #{word2}"] - break if wordlist.empty? - word3 = wordlist[rand(wordlist.length)] - break if word3 == :nonword + @bot.config['markov.max_words'].times do + word3 = pick_word(word1, word2) + break if word3 == MARKER output << word3 word1, word2 = word2, word3 end @@ -258,28 +607,62 @@ class MarkovPlugin < Plugin return if ignore? m # in channel message, the kind we are interested in - message = clean_str m.plainmessage + message = m.plainmessage if m.action? message = "#{m.sourcenick} #{message}" end - learn message - random_markov(m, message) unless m.replied? + random_markov(m, message) unless readonly? m or m.replied? + learn clean_message(m) end + + def learn_triplet(word1, word2, word3) + k = "#{word1} #{word2}" + rk = "#{word2} #{word3}" + @chains_mutex.synchronize do + total = 0 + hash = Hash.new(0) + if @chains.key? k + t2, h2 = @chains[k] + total += t2 + hash.update h2 + end + hash[word3] += 1 + total += 1 + @chains[k] = [total, hash] + end + @rchains_mutex.synchronize do + # Reverse + total = 0 + hash = Hash.new(0) + if @rchains.key? rk + t2, h2 = @rchains[rk] + total += t2 + hash.update h2 + end + hash[word1] += 1 + total += 1 + @rchains[rk] = [total, hash] + end + end + + def learn_line(message) - # debug "learning #{message}" - wordlist = message.split(/\s+/) + # debug "learning #{message.inspect}" + wordlist = message.strip.split(/\s+/).reject do |w| + @bot.config['markov.ignore_patterns'].map do |pat| + w =~ Regexp.new(pat.to_s) + end.select{|v| v}.size != 0 + end return unless wordlist.length >= 2 - word1, word2 = :nonword, :nonword + word1, word2 = MARKER, MARKER + wordlist << MARKER wordlist.each do |word3| - k = "#{word1} #{word2}" - @registry[k] = @registry[k].push(word3) + learn_triplet(word1, word2, word3.to_sym) word1, word2 = word2, word3 end - k = "#{word1} #{word2}" - @registry[k] = @registry[k].push(:nonword) end # TODO allow learning from URLs @@ -346,18 +729,30 @@ class MarkovPlugin < Plugin m.okay end + + def stats(m, params) + m.reply "Markov status: chains: #{@chains.length} forward, #{@rchains.length} reverse, queued phrases: #{@learning_queue.size}" + end + end plugin = MarkovPlugin.new +plugin.map 'markov delay :delay', :action => "set_delay" +plugin.map 'markov delay', :action => "set_delay" plugin.map 'markov ignore :action :option', :action => "ignore" plugin.map 'markov ignore :action', :action => "ignore" plugin.map 'markov ignore', :action => "ignore" +plugin.map 'markov readonly :action :option', :action => "readonly" +plugin.map 'markov readonly :action', :action => "readonly" +plugin.map 'markov readonly', :action => "readonly" plugin.map 'markov enable', :action => "enable" plugin.map 'markov disable', :action => "disable" plugin.map 'markov status', :action => "status" -plugin.map 'chat about :seed1 [:seed2]', :action => "chat" +plugin.map 'markov stats', :action => "stats" +plugin.map 'chat about :seed1 [:seed2]', :action => "chat", :defaults => {:seed2 => nil} plugin.map 'chat', :action => "rand_chat" plugin.map 'markov probability [:probability]', :action => "probability", + :defaults => {:probability => nil}, :requirements => {:probability => /^\d+%?$/} plugin.map 'markov learn from :file [:testing [:lines lines]] [using pattern *pattern]', :action => "learn_from", :thread => true, :requirements => {