imdb plugin: fix detection of multiple titles per year

[user/henk/code/ruby/rbot.git] / data / rbot / plugins / imdb.rb
diff --git a/data/rbot/plugins/imdb.rb b/data/rbot/plugins/imdb.rb
index 5615ac00c4dd7e4cb07fc1a8b807738a8501d77d..7887655dfd641da538940a5f798fad2ce9029f87 100644
--- a/data/rbot/plugins/imdb.rb
+++ b/data/rbot/plugins/imdb.rb
@@ -4,28 +4,36 @@
  # :title: IMDB plugin for rbot
  #
  # Author:: Arnaud Cornet <arnaud.cornet@gmail.com>
+# Author:: Giuseppe "Oblomov" Bilotta <giuseppe.bilotta@gmail.com>
+#
  # Copyright:: (C) 2005 Arnaud Cornet
-# License:: MIT license
+# Copyright:: (C) 2007 Giuseppe Bilotta
  #
-# Notes by Giuseppe Bilotta:
-# TODO return more than one match (configurable)
+# License:: MIT license
  
  require 'uri/common'
  
  class Imdb
+  IMDB = "http://us.imdb.com"
+  TITLE_OR_NAME_MATCH = /<a href="(\/(?:title|name)\/(?:tt|nm)[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/
+  TITLE_MATCH = /<a href="(\/title\/tt[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/
+  NAME_MATCH = /<a href="(\/name\/nm[0-9]+\/?)[^"]*"(?:[^>]*)>([^<]*)<\/a>/
+  FINAL_ARTICLE_MATCH = /, ([A-Z]\S{0,2})$/
+
    def initialize(bot)
      @bot = bot
    end
  
    def search(rawstr)
-    str = URI.escape(rawstr) << ";site=aka"
+    str = URI.escape(rawstr)
+    str << ";site=aka" if @bot.config['imdb.aka']
      return do_search(str)
    end
  
    def do_search(str)
      resp = nil
      begin
-      resp = @bot.httputil.get_response("http://us.imdb.com/find?q=#{str}",
+      resp = @bot.httputil.get_response(IMDB + "/find?q=#{str}",
                                          :max_redir => -1)
      rescue Exception => e
        error e.message
@@ -34,77 +42,123 @@ class Imdb
      end
  
      if resp.code == "200"
-      m = /<a href="(\/(?:title|name)\/(?:tt|nm)[0-9]+\/?)[^"]*"(?:[^>]*)>(?:[^<]*)<\/a>/.match(resp.body)
-      if m
-        url = m[1]
-        return url
+      m = []
+      m << TITLE_OR_NAME_MATCH.match(resp.body) if @bot.config['imdb.popular']
+      if resp.body.match(/\(Exact Matches\)<\/b>/) and @bot.config['imdb.exact']
+        m << TITLE_OR_NAME_MATCH.match($')
+      end
+      m.compact!
+      unless m.empty?
+        return m.map { |mm|
+          mm[1]
+        }.uniq
        end
      elsif resp.code == "302"
-      new_loc = resp['location'].gsub(/http:\/\/us.imdb.com/, "")
+      debug "automatic redirection"
+      new_loc = resp['location'].gsub(IMDB, "")
        if new_loc.match(/\/find\?q=(.*)/)
          return do_search($1)
        else
-        return new_loc.gsub(/\?.*/, "")
+        return [new_loc.gsub(/\?.*/, "")]
        end
      end
      return nil
    end
  
    def info(rawstr)
-    sr = search(rawstr)
-    if !sr
+    urls = search(rawstr)
+    debug urls
+    if urls.nil_or_empty?
        debug "IMDB: search returned NIL"
        return nil
      end
-    type = sr.match(/^\/([^\/]+)\//)[1].downcase.intern rescue nil
-    case type
-    when :title
-      return info_title(sr)
-    when :name
-      return info_name(sr)
-    else
-      return "#{sr}"
-    end
+    results = []
+    urls.each { |sr|
+      type = sr.match(/^\/([^\/]+)\//)[1].downcase.intern rescue nil
+      case type
+      when :title
+        results << info_title(sr)
+      when :name
+        results << info_name(sr)
+      else
+        results << "#{sr}"
+      end
+    }
+    return results
    end
  
    def grab_info(info, body)
      /<div class="info">\s+<h5>#{info}:<\/h5>\s+(.*?)<\/div>/mi.match(body)[1] rescue nil
    end
  
+  def fix_article(org_tit)
+    title = org_tit.dup
+    if @bot.config['imdb.fix_article'] and title.gsub!(FINAL_ARTICLE_MATCH, '')
+      art = $1.dup
+      debug art.inspect
+      if art[-1,1].match(/[a-z]/)
+        art << " "
+      end
+      return art + title
+    end
+    return title
+  end
+
    def info_title(sr)
      resp = nil
      begin
-      resp = @bot.httputil.get_response('http://us.imdb.com' + sr,
-                                        :max_redir => -1)
+      resp = @bot.httputil.get_response(IMDB + sr, :max_redir => -1)
      rescue Exception => e
        error e.message
        warning e.backtrace.join("\n")
        return nil
      end
  
+    info = []
+
      if resp.code == "200"
        m = /<title>([^<]*)<\/title>/.match(resp.body)
        return nil if !m
-      title = Utils.decode_html_entities(m[1])
+      title_date = m[1]
+      pre_title, date, extra = title_date.scan(/^(.*)\((\d\d\d\d(?:\/[IV]+)?)\)\s*(.+)?$/).first
+      pre_title.strip!
+      title = fix_article(pre_title)
  
-      m = /<b>([0-9.]+)\/10<\/b>\n?\r?\s+<small>\(<a href="ratings">([0-9,]+) votes?<\/a>\)<\/small>/.match(resp.body)
-      return nil if !m
-      score = m[1]
-      votes = m[2]
+      dir = nil
+      data = grab_info(/Directors?/, resp.body)
+      if data
+        dir = data.scan(NAME_MATCH).map { |url, name|
+          name
+        }.join(', ')
+      end
  
-      plot = nil
-      data = grab_info(/Plot (?:Outline|Summary)/, resp.body)
+      country = nil
+      data = grab_info(/Country/, resp.body)
        if data
-        plot = "Plot: #{data.ircify_html.gsub(/\s+more$/,'')}"
+        country = data.ircify_html
+      end
+
+      info << [title, "(#{country}, #{date})", extra, dir ? "[#{dir}]" : nil, ": http://us.imdb.com#{sr}"].compact.join(" ")
+
+      ratings = "no votes"
+      m = /<b>([0-9.]+)\/10<\/b>\n?\r?\s+<small>\(<a href="ratings">([0-9,]+) votes?<\/a>\)<\/small>/.match(resp.body)
+      if m
+        ratings = "#{m[1]}/10 (#{m[2]} voters)"
        end
  
        genre = Array.new
        resp.body.scan(/<a href="\/Sections\/Genres\/[^\/]+\/">([^<]+)<\/a>/) do |gnr|
          genre << gnr
        end
-      info = "#{title} : http://us.imdb.com#{sr}\n"
-      info << "Ratings: #{score}/10 (#{votes} voters). Genre: #{genre.join('/')}\n"
-      info << plot if plot
+
+      plot = nil
+      data = grab_info(/Plot (?:Outline|Summary)/, resp.body)
+      if data
+        plot = "Plot: " + data.ircify_html.gsub(/\s+more$/,'')
+      end
+
+      info << ["Ratings: " << ratings, "Genre: " << genre.join('/') , plot].compact.join(". ")
+
        return info
      end
      return nil
@@ -113,18 +167,21 @@ class Imdb
    def info_name(sr)
      resp = nil
      begin
-      resp = @bot.httputil.get_response('http://us.imdb.com' + sr,
-                                        :max_redir => -1)
+      resp = @bot.httputil.get_response(IMDB + sr, :max_redir => -1)
      rescue Exception => e
        error e.message
        warning e.backtrace.join("\n")
        return nil
      end
  
+    info = []
+
      if resp.code == "200"
        m = /<title>([^<]*)<\/title>/.match(resp.body)
        return nil if !m
-      name = Utils.decode_html_entities(m[1])
+      name = m[1]
+
+      info << "#{name} : http://us.imdb.com#{sr}"
  
        birth = nil
        data = grab_info("Date of Birth", resp.body)
@@ -138,34 +195,46 @@ class Imdb
          death = "Death: #{data.ircify_html.gsub(/\s+more$/,'')}"
        end
  
+      info << [birth, death].compact.join('. ') if birth or death
+
        movies = {}
  
        filmorate = nil
        begin
-        filmorate = @bot.httputil.get("http://us.imdb.com" + sr + "filmorate")
+        filmorate = @bot.httputil.get(IMDB + sr + "filmorate")
        rescue Exception
        end
  
        if filmorate
          filmorate.scan(/<div class="filmo">.*?<a href="\/title.*?<\/div>/m) { |str|
            what = str.match(/<a name="[^"]+">([^<]+)<\/a>/)[1] rescue nil
-          # next unless what
-          next unless ['Actor', 'Director'].include?(what)
-          movies[what] = str.scan(/<a href="\/title\/[^"]+">([^<]+)<\/a>/)[0..2].map { |tit|
-            Utils.decode_html_entities(tit)
+          next unless what
+          movies[what] = str.scan(TITLE_MATCH)[0..2].map { |url, tit|
+            fix_article(tit)
            }
          }
        end
-      debug movies.inspect
  
-      info = "#{name} : http://us.imdb.com#{sr}\n"
-      info << [birth, death].compact.join('. ') << "\n"
+      preferred = ['Actor', 'Director']
+      if resp.body.match(/Jump to filmography as:&nbsp;(.*?)<\/div>/)
+        txt = $1
+        preferred = txt.scan(/<a[^>]+>([^<]+)<\/a>/)[0..2].map { |pref|
+          pref.first
+        }
+      end
+
        unless movies.empty?
-        info << "Top Movies:: "
+        all_keys = movies.keys.sort
+        debug all_keys.inspect
+        keys = []
+        preferred.each { |key|
+          keys << key if all_keys.include? key
+        }
+        keys = all_keys if keys.empty?
          ar = []
-        movies.keys.sort.each { |key|
+        keys.each { |key|
            ar << key.dup
-          ar.last << ": " + movies[key].join(', ')
+          ar.last << ": " + movies[key].join('; ')
          }
          info << ar.join('. ')
        end
@@ -177,6 +246,19 @@ class Imdb
  end
  
  class ImdbPlugin < Plugin
+  BotConfig.register BotConfigBooleanValue.new('imdb.aka',
+    :default => true,
+    :desc => "Look for IMDB matches also in translated titles and other 'also known as' information")
+  BotConfig.register BotConfigBooleanValue.new('imdb.popular',
+    :default => true,
+    :desc => "Display info on popular IMDB entries matching the request closely")
+  BotConfig.register BotConfigBooleanValue.new('imdb.exact',
+    :default => true,
+    :desc => "Display info on IMDB entries matching the request exactly")
+  BotConfig.register BotConfigBooleanValue.new('imdb.fix_article',
+    :default => false,
+    :desc => "Try to detect an article placed at the end and move it in front of the title")
+
    def help(plugin, topic="")
      "imdb <string> => search http://www.imdb.org for <string>"
    end
@@ -189,7 +271,13 @@ class ImdbPlugin < Plugin
        m.reply "Nothing found for #{what}"
        return nil
      end
-    m.reply info
+    if info.length == 1
+      m.reply Utils.decode_html_entities info.first.join("\n")
+    else
+      m.reply info.map { |i|
+        Utils.decode_html_entities i.join(" | ")
+      }.join("\n")
+    end
    end
  end