diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2011-12-08 16:20:43 +0100 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2011-12-08 16:20:43 +0100 |
commit | 932577fed8264daebdc089a4f4189b1b25fe2e7d (patch) | |
tree | 1ac17fcd3715ab6db7d13539f6bf74fe553b3baf | |
parent | 1336c5da256161ae2c1468f3d3e67fdb4acc3151 (diff) |
Improve first par detection without hpricot
-rw-r--r-- | lib/rbot/core/utils/utils.rb | 8 |
1 files changed, 6 insertions, 2 deletions
```diff
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 7fe83410..e3392f1f 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -127,7 +127,7 @@ rescue LoadError

     # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
     # to mark actual text
-    AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+    AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im

     # At worst, we can try stuff which is comprised between two <br>
     AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
@@ -493,7 +493,11 @@ module ::Irc

     # HTML first par grabber without hpricot
     def Utils.ircify_first_html_par_woh(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
+      xml = xml_org.gsub(/<!--.*?-->/m,
+                         "").gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im,
+                         "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im,
+                         "").gsub(/<select(?:\s+[^>]*)?>.*?<\/select>/im,
+                         "")

       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
```