HTML processing refactoring: HTML title extraction is now a String method

author: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-09-18 06:15:45 +0000
committer: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-09-18 06:15:45 +0000
commit: 2da3a85740963a5dc4e9390115e13139f97511e2 (patch)
tree: 9b8df767c92c1ab1d406e1bad9d832b0b19df801 /lib/rbot
parent: 6b57387fd524539e831fc434f626659d7d07d61c (diff)
2 files changed, 18 insertions, 1 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index e0c781b1..0b07257a 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -178,6 +178,20 @@ class ::String
   def riphtml
     self.gsub(/<[^>]+>/, '').gsub(/&amp;/,'&').gsub(/&quot;/,'"').gsub(/&lt;/,'<').gsub(/&gt;/,'>').gsub(/&ellip;/,'...').gsub(/&apos;/, "'").gsub("\n",'')
   end
+
+  # This method tries to find an HTML title in the string,
+  # and returns it if found (nil otherwise)
+  def get_html_title
+    return unless Irc::Utils::TITLE_REGEX.match(self)
+    $1
+  end
+
+  # This method returns the IRC-formatted version of the
+  # HTML title found in the string, or nil if no title matches
+  def ircify_html_title
+    return unless Irc::Utils::TITLE_REGEX.match(self)
+    $1.ircify_html
+  end
 end
 
 
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index a4f071a2..0b10b52f 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -317,7 +317,10 @@ rescue LoadError
   else
     module ::Irc
       module Utils
-        # Define some regular expressions to be used by first_html_par
+        # Some regular expressions to manage HTML data
+
+        # Title
+        TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
 
         # H1, H2, etc
         HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
author	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-09-18 06:15:45 +0000
committer	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-09-18 06:15:45 +0000
commit	2da3a85740963a5dc4e9390115e13139f97511e2 (patch)
tree	9b8df767c92c1ab1d406e1bad9d832b0b19df801 /lib/rbot
parent	6b57387fd524539e831fc434f626659d7d07d61c (diff)