Utils: more first par enhancements

author: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-03-26 12:44:14 +0000
committer: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-03-26 12:44:14 +0000
commit: cb9a6b2b4f3d5b79e12e97a4ef9e75190803606a (patch)
tree: b9c5e0f8d3b1d32a465cc51f10676458c0659bc9
parent: a3cf806450893638f98096ab96c4c25023bb01c3 (diff)
2 files changed, 18 insertions, 8 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index 7022fb91..0ecf7aa2 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -41,18 +41,24 @@ class ::String
   def ircify_html
     txt = self
 
+    # remove scripts
+    txt.gsub!(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "")
+
+    # remove styles
+    txt.gsub!(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+
     # bold and strong -> bold
-    txt.gsub!(/<\/?(?:b|strong)\s*>/, "#{Bold}")
+    txt.gsub!(/<\/?(?:b|strong)(?:\s+[^>]*)?>/im, "#{Bold}")
 
     # italic, emphasis and underline -> underline
-    txt.gsub!(/<\/?(?:i|em|u)\s*>/, "#{Underline}")
+    txt.gsub!(/<\/?(?:i|em|u)(?:\s+[^>]*)?>/im, "#{Underline}")
 
     ## This would be a nice addition, but the results are horrible
     ## Maybe make it configurable?
     # txt.gsub!(/<\/?a( [^>]*)?>/, "#{Reverse}")
 
     # Paragraph and br tags are converted to whitespace
-    txt.gsub!(/<\/?(p|br)\s*\/?\s*>/, ' ')
+    txt.gsub!(/<\/?(p|br)(?:\s+[^>]*)?\s*\/?\s*>/, ' ')
     txt.gsub!("\n", ' ')
     txt.gsub!("\r", ' ')
 
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index f2918067..e1d61039 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -440,12 +440,12 @@ module ::Irc
     HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
     PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 
-    # Some blogging and forum platforms use spans or divs with a 'body' in their class
+    # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
     # to mark actual text
-    AFTER_PAR1_REGEX = /<\w+\s+[^>]*body[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
 
     # At worst, we can try stuff which is comprised between two <br>
-    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
 
     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
     # If possible, grab the one after the first heading
@@ -456,8 +456,8 @@ module ::Irc
     #               text
     #   * :min_spaces => Minimum number of spaces a paragraph should have
     #
-    def Utils.ircify_first_html_par(xml, opts={})
-      txt = String.new
+    def Utils.ircify_first_html_par(xml_org, opts={})
+      xml = xml_org.gsub(/<!--.*?-->/, '')
 
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
@@ -465,6 +465,8 @@ module ::Irc
       min_spaces = opts[:min_spaces] || 8
       min_spaces = 0 if min_spaces < 0
 
+      txt = String.new
+
       while true
         debug "Minimum number of spaces: #{min_spaces}"
         header_found = xml.match(HX_REGEX)
@@ -511,6 +513,8 @@ module ::Irc
           debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
         end
 
+        return txt unless txt.empty? or txt.count(" ") < min_spaces
+
         # Attempt #2
         header_found = xml
         while txt.empty? or txt.count(" ") < min_spaces
author	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-03-26 12:44:14 +0000
committer	Giuseppe Bilotta <giuseppe.bilotta@gmail.com>	2007-03-26 12:44:14 +0000
commit	cb9a6b2b4f3d5b79e12e97a4ef9e75190803606a (patch)
tree	b9c5e0f8d3b1d32a465cc51f10676458c0659bc9
parent	a3cf806450893638f98096ab96c4c25023bb01c3 (diff)