From 93876c8804eb04a1ee7a43943d6a07c34ff6f0fc Mon Sep 17 00:00:00 2001
From: Giuseppe Bilotta
Date: Tue, 18 Sep 2007 23:40:44 +0000
Subject: first_html_par: after-paragraph matches should prefer divs and spans to other elements

---
 lib/rbot/core/utils/utils.rb | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 0582cd4b..1b6a0ce9 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -307,7 +307,8 @@ begin
   require 'hpricot'
   module ::Irc
     module Utils
-      AFTER_PAR_PATH = /^(?:div|span|td|tr|tbody|table)$/
+      AFTER_PAR_PATH = /^(?:div|span)$/
+      AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/
       AFTER_PAR_CLASS = /body|message|text/i
     end
   end
@@ -556,13 +557,23 @@ module ::Irc
       # 'message' or 'text' in their class to mark actual text. Since we want
       # the class match to be partial and case insensitive, we collect
       # the common elements that may have this class and then filter out those
-      # we don't need
+      # we don't need. If no divs or spans are found, we'll accept additional
+      # elements too (td, tr, tbody, table).
       if by_span.nil?
         by_span = Hpricot::Elements[]
+        extra = Hpricot::Elements[]
         doc.search("*") { |el|
           next if el.bogusetag?
-          by_span.push el if el.pathname =~ AFTER_PAR_PATH and (el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS)
+          case el.pathname
+          when AFTER_PAR_PATH
+            by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+          when AFTER_PAR_EX
+            extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+          end
         }
+        if by_span.empty? and not extra.empty?
+          by_span.concat extra
+        end
         debug "other \#1: found: #{by_span.pretty_inspect}"
       end
 
--
cgit v1.2.3