From adb212bdfc678af04fa438b42ec06047a13a8f2c Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Tue, 18 Sep 2007 17:31:24 +0000 Subject: first_html_par: build lists 'manually' when using Hpricot Hpricot selectors (like doc/"css path") don't return elements in their natural (depth-first) order. Use custom searches from the root of the document to achieve this. --- lib/rbot/core/utils/utils.rb | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 32b05700..9b678def 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -498,25 +498,26 @@ module ::Irc txt = String.new - h = %w{h1 h2 h3 h4 h5 h6} - p = %w{p} - ar = [] - h.each { |hx| - p.each { |px| - ar << "#{hx}~#{px}" - } - } - h_p_css = ar.join("|") - debug "css search: #{h_p_css}" - pre_h = pars = by_span = nil while true debug "Minimum number of spaces: #{min_spaces}" # Initial attempt:

<p> that follows <h\d> - pre_h = doc/h_p_css if pre_h.nil? - debug "Hx: found: #{pre_h.pretty_inspect}" + if pre_h.nil? + pre_h = Hpricot::Elements[] + found_h = false + doc.root.search("*") { |e| + case e.pathname + when /^h\d/ + found_h = true + when 'p' + pre_h << e if found_h + end + } + debug "Hx: found: #{pre_h.pretty_inspect}" + end + pre_h.each { |p| debug p txt = p.to_html.ircify_html @@ -551,9 +552,8 @@ module ::Irc # we don't need if by_span.nil? by_span = Hpricot::Elements[] - pre_pars = doc/"div|span|td|tr|tbody|table" - pre_pars.each { |el| - by_span.push el if el[:class] =~ /body|message|text/i + doc.root.each("*") { |el| + by_span.push el if el.pathname =~ /^(?:div|span|td|tr|tbody|table)$/ and el[:class] =~ /body|message|text/i } debug "other \#1: found: #{by_span.pretty_inspect}" end -- cgit v1.2.3