From 2da3a85740963a5dc4e9390115e13139f97511e2 Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Tue, 18 Sep 2007 06:15:45 +0000 Subject: HTML processing refactoring: HTML title extraction is now a String method --- data/rbot/plugins/url.rb | 4 +--- lib/rbot/core/utils/extends.rb | 14 ++++++++++++++ lib/rbot/core/utils/utils.rb | 5 ++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb index 6e609130..0809288f 100644 --- a/data/rbot/plugins/url.rb +++ b/data/rbot/plugins/url.rb @@ -9,7 +9,6 @@ class ::UrlLinkError < RuntimeError end class UrlPlugin < Plugin - TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im LINK_INFO = "[Link Info]" OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N') @@ -53,8 +52,7 @@ class UrlPlugin < Plugin end def get_title_from_html(pagedata) - return unless TITLE_RE.match(pagedata) - $1.ircify_html + return pagedata.ircify_html_title end def get_title_for_url(uri_str, opts = {}) diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb index e0c781b1..0b07257a 100644 --- a/lib/rbot/core/utils/extends.rb +++ b/lib/rbot/core/utils/extends.rb @@ -178,6 +178,20 @@ class ::String def riphtml self.gsub(/<[^>]+>/, '').gsub(/&amp;/,'&').gsub(/&quot;/,'"').gsub(/&lt;/,'<').gsub(/&gt;/,'>').gsub(/&ellip;/,'...').gsub(/&apos;/, "'").gsub("\n",'') end + + # This method tries to find an HTML title in the string, + # and returns it if found + def get_html_title + return unless Irc::Utils::TITLE_REGEX.match(self) + $1 + end + + # This method returns the IRC-formatted version of an + # HTML title found in the string + def ircify_html_title + return unless Irc::Utils::TITLE_REGEX.match(self) + $1.ircify_html + end end diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index a4f071a2..0b10b52f 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -317,7 +317,10 @@ rescue LoadError else module ::Irc 
module Utils - # Define some regular expressions to be used by first_html_par + # Some regular expressions to manage HTML data + + # Title + TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im # H1, H2, etc HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im -- cgit v1.2.3