HTML processing refactoring: HTML title extraction is now a String method

author Giuseppe Bilotta <giuseppe.bilotta@gmail.com>

Tue, 18 Sep 2007 06:15:45 +0000 (06:15 +0000)

committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com>

Tue, 18 Sep 2007 06:15:45 +0000 (06:15 +0000)
author Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
Tue, 18 Sep 2007 06:15:45 +0000 (06:15 +0000)
committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
Tue, 18 Sep 2007 06:15:45 +0000 (06:15 +0000)
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb

index 6e609130b1a17c993cd53c0f6c4630db36c14fb5..0809288f3c260ba5c8558dc1118e824552e3fee6 100644 (file)
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -9,7 +9,6 @@ class ::UrlLinkError < RuntimeError
  end
  
  class UrlPlugin < Plugin
-  TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
    LINK_INFO = "[Link Info]"
    OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
  
@@ -53,8 +52,7 @@ class UrlPlugin < Plugin
    end
  
    def get_title_from_html(pagedata)
-    return unless TITLE_RE.match(pagedata)
-    $1.ircify_html
+    return pagedata.ircify_html_title
    end
  
    def get_title_for_url(uri_str, opts = {})
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb

index e0c781b194cba78910028f156c2ea735b80f1fe0..0b07257a1b58a42b2fe4d98b6da40ba43beb63f7 100644 (file)
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -178,6 +178,20 @@ class ::String
    def riphtml
      self.gsub(/<[^>]+>/, '').gsub(/&amp;/,'&').gsub(/&quot;/,'"').gsub(/&lt;/,'<').gsub(/&gt;/,'>').gsub(/&ellip;/,'...').gsub(/&apos;/, "'").gsub("\n",'')
    end
+
+  # This method tries to find an HTML title in the string,
+  # and returns it if found
+  def get_html_title
+    return unless Irc::Utils::TITLE_REGEX.match(self)
+    $1
+  end
+
+  # This method returns the IRC-formatted version of an
+  # HTML title found in the string
+  def ircify_html_title
+    return unless Irc::Utils::TITLE_REGEX.match(self)
+    $1.ircify_html
+  end
  end
  
  
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb

index a4f071a28a011f1dd8611f994beb9a720dbd0e91..0b10b52fd1ab3efe5dfba6b86187fe989fa86e77 100644 (file)
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -317,7 +317,10 @@ rescue LoadError
    else
      module ::Irc
        module Utils
-        # Define some regular expressions to be used by first_html_par
+        # Some regular expressions to manage HTML data
+
+        # Title
+        TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
  
          # H1, H2, etc
          HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
author	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
	Tue, 18 Sep 2007 06:15:45 +0000 (06:15 +0000)
committer	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
	Tue, 18 Sep 2007 06:15:45 +0000 (06:15 +0000)
data/rbot/plugins/url.rb		patch | blob | history
lib/rbot/core/utils/extends.rb		patch | blob | history
lib/rbot/core/utils/utils.rb		patch | blob | history