summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-09-18 06:15:45 +0000
committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-09-18 06:15:45 +0000
commit 2da3a85740963a5dc4e9390115e13139f97511e2 (patch)
tree 9b8df767c92c1ab1d406e1bad9d832b0b19df801
parent 6b57387fd524539e831fc434f626659d7d07d61c (diff)
HTML processing refactoring: HTML title extraction is now a String method
-rw-r--r-- data/rbot/plugins/url.rb 4
-rw-r--r-- lib/rbot/core/utils/extends.rb 14
-rw-r--r-- lib/rbot/core/utils/utils.rb 5
3 files changed, 19 insertions, 4 deletions
diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index 6e609130..0809288f 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -9,7 +9,6 @@ class ::UrlLinkError < RuntimeError
end
class UrlPlugin < Plugin
- TITLE_RE = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
LINK_INFO = "[Link Info]"
OUR_UNSAFE = Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}%# ]", false, 'N')
@@ -53,8 +52,7 @@ class UrlPlugin < Plugin
end
def get_title_from_html(pagedata)
- return unless TITLE_RE.match(pagedata)
- $1.ircify_html
+ return pagedata.ircify_html_title
end
def get_title_for_url(uri_str, opts = {})
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index e0c781b1..0b07257a 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -178,6 +178,20 @@ class ::String
def riphtml
self.gsub(/<[^>]+>/, '').gsub(/&amp;/,'&').gsub(/&quot;/,'"').gsub(/&lt;/,'<').gsub(/&gt;/,'>').gsub(/&ellip;/,'...').gsub(/&apos;/, "'").gsub("\n",'')
end
+
+ # This method tries to find an HTML title in the string,
+ # and returns it if found
+ def get_html_title
+ return unless Irc::Utils::TITLE_REGEX.match(self)
+ $1
+ end
+
+ # This method returns the IRC-formatted version of an
+ # HTML title found in the string
+ def ircify_html_title
+ return unless Irc::Utils::TITLE_REGEX.match(self)
+ $1.ircify_html
+ end
end
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index a4f071a2..0b10b52f 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -317,7 +317,10 @@ rescue LoadError
else
module ::Irc
module Utils
- # Define some regular expressions to be used by first_html_par
+ # Some regular expressions to manage HTML data
+
+ # Title
+ TITLE_REGEX = /<\s*?title\s*?>(.+?)<\s*?\/title\s*?>/im
# H1, H2, etc
HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im