summary refs log tree commit diff
diff options
context:
space:
mode:
author Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2011-12-08 16:20:43 +0100
committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2011-12-08 16:20:43 +0100
commit 932577fed8264daebdc089a4f4189b1b25fe2e7d (patch)
tree 1ac17fcd3715ab6db7d13539f6bf74fe553b3baf
parent 1336c5da256161ae2c1468f3d3e67fdb4acc3151 (diff)
Improve first par detection without hpricot
-rw-r--r-- lib/rbot/core/utils/utils.rb | 8
1 files changed, 6 insertions, 2 deletions
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 7fe83410..e3392f1f 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -127,7 +127,7 @@ rescue LoadError
# Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
# to mark actual text
- AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+ AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
# At worst, we can try stuff which is comprised between two <br>
AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
@@ -493,7 +493,11 @@ module ::Irc
# HTML first par grabber without hpricot
def Utils.ircify_first_html_par_woh(xml_org, opts={})
- xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+ xml = xml_org.gsub(/<!--.*?-->/m,
+ "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+ "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+ "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+ "")
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)