From 932577fed8264daebdc089a4f4189b1b25fe2e7d Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Thu, 8 Dec 2011 16:20:43 +0100 Subject: Improve first par detection without hpricot --- lib/rbot/core/utils/utils.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib/rbot') diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 7fe83410..e3392f1f 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -127,7 +127,7 @@ rescue LoadError # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class # to mark actual text - AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im + AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text|post)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im # At worst, we can try stuff which is comprised between two
AFTER_PAR2_REGEX = /]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im @@ -493,7 +493,11 @@ module ::Irc # HTML first par grabber without hpricot def Utils.ircify_first_html_par_woh(xml_org, opts={}) - xml = xml_org.gsub(//m, '').gsub(/]*)?>.*?<\/script>/im, "").gsub(/]*)?>.*?<\/style>/im, "") + xml = xml_org.gsub(//m, + "").gsub(/]*)?>.*?<\/script>/im, + "").gsub(/]*)?>.*?<\/style>/im, + "").gsub(/]*)?>.*?<\/select>/im, + "") strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) -- cgit v1.2.3