diff options
author | Chris Gahan <chris@ill-logic.com> | 2006-02-03 15:41:14 +0000 |
---|---|---|
committer | Chris Gahan <chris@ill-logic.com> | 2006-02-03 15:41:14 +0000 |
commit | f2171f43f28ad8585ec5503bfa5094e053a482af (patch) | |
tree | c56482db34618ee1456bb650af39c7db35583ccd /data/rbot | |
parent | 90e7074ebda1180c702473c924434dd72a3dbf4a (diff) |
Since the CGI::unescapeHTML is pretty crappy, I added my own &thing;
translator (that uses IRC-printable characters).
Diffstat (limited to 'data/rbot')
-rw-r--r-- | data/rbot/plugins/url.rb | 35 |
1 files changed, 34 insertions, 1 deletions
```diff
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index 1e46f633..95afa12d 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -5,6 +5,21 @@ require 'cgi'
 Url = Struct.new("Url", :channel, :nick, :time, :url)
 TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
 
+UNESCAPE_TABLE = {
+  'raquo' => '>>',
+  '#8220' => '"',
+  '#8221' => '"',
+  '#8212' => '--',
+  '#39' => '\'',
+  '#174' => '(R)',
+  'micro' => 'u',
+  '' => '',
+  '' => '',
+  '' => '',
+  '' => '',
+  #'' => '',
+}
+
 class UrlPlugin < Plugin
   BotConfig.register BotConfigIntegerValue.new('url.max_urls',
     :default => 100, :validate => Proc.new{|v| v > 0},
@@ -22,10 +37,28 @@ class UrlPlugin < Plugin
     "urls [<max>=4] => list <max> last urls mentioned in current channel, urls search [<max>=4] <regexp> => search for matching urls. In a private message, you must specify the channel to query, eg. urls <channel> [max], urls search <channel> [max] <regexp>"
   end
 
+  def unescape_title(htmldata)
+    # first pass -- let CGI try to attack it...
+    htmldata = CGI::unescapeHTML htmldata
+
+    # second pass -- destroy the remaining bits...
+    htmldata.gsub(/(&(.+?);)/) {
+      symbol = $2
+
+      # remove the 0-paddng from unicode integers
+      if symbol =~ /#(.+)/
+        symbol = "##{$1.to_i.to_s}"
+      end
+
+      # output the symbol's irc-translated character, or a * if it's unknown
+      UNESCAPE_TABLE[symbol] || '*'
+    }
+  end
+
   def get_title_from_html(pagedata)
     return unless TITLE_RE.match(pagedata)
     title = $1.strip.gsub(/\s*\n+\s*/, " ")
-    title = CGI::unescapeHTML title
+    title = unescape_title title
     title = title[0..255] if title.length > 255
     "[Link Info] title: #{title}"
   end
```