From 93876c8804eb04a1ee7a43943d6a07c34ff6f0fc Mon Sep 17 00:00:00 2001
From: Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
Date: Tue, 18 Sep 2007 23:40:44 +0000
Subject: first_html_par: after-paragraph matches should prefer divs and spans
 to other elements

---
 lib/rbot/core/utils/utils.rb | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 0582cd4b..1b6a0ce9 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -307,7 +307,8 @@ begin
   require 'hpricot'
   module ::Irc
     module Utils
-      AFTER_PAR_PATH = /^(?:div|span|td|tr|tbody|table)$/
+      AFTER_PAR_PATH = /^(?:div|span)$/
+      AFTER_PAR_EX = /^(?:td|tr|tbody|table)$/
       AFTER_PAR_CLASS = /body|message|text/i
     end
   end
@@ -556,13 +557,23 @@ module ::Irc
         # 'message' or 'text' in their class to mark actual text. Since we want
         # the class match to be partial and case insensitive, we collect
         # the common elements that may have this class and then filter out those
-        # we don't need
+        # we don't need. If no div or span passes the filter, we fall back to
+        # the other filtered elements (td, tr, tbody, table) instead.
         if by_span.nil?
           by_span = Hpricot::Elements[]
+          extra = Hpricot::Elements[]
           doc.search("*") { |el|
             next if el.bogusetag?
-            by_span.push el if el.pathname =~ AFTER_PAR_PATH and (el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS)
+            case el.pathname
+            when AFTER_PAR_PATH
+              by_span.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+            when AFTER_PAR_EX
+              extra.push el if el[:class] =~ AFTER_PAR_CLASS or el[:id] =~ AFTER_PAR_CLASS
+            end
           }
+          if by_span.empty? and not extra.empty?
+            by_span.concat extra
+          end
           debug "other \#1: found: #{by_span.pretty_inspect}"
         end
 
-- 
cgit v1.2.3