summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorGiuseppe Bilotta <giuseppe.bilotta@gmail.com>2007-09-11 08:58:38 +0000
committerGiuseppe Bilotta <giuseppe.bilotta@gmail.com>2007-09-11 08:58:38 +0000
commitde209119bd6b098381db11d9b0de07c7d898c12e (patch)
treecb0c983938f08edcef7e578d91747f4612930131 /lib
parentd9878de310537cec9c9db889b453b36464aafbe2 (diff)
first_html_par: use hpricot if available
Diffstat (limited to 'lib')
-rw-r--r--lib/rbot/core/utils/utils.rb144
1 files changed, 134 insertions, 10 deletions
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index fe073c62..9b978ad8 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -303,6 +303,37 @@ rescue LoadError
end
end
+begin
+ require 'htmlentities'
+rescue LoadError
+ gems = nil
+ begin
+ gems = require 'rubygems'
+ rescue LoadError
+ gems = false
+ end
+ if gems
+ retry
+ else
+ module ::Irc
+ module Utils
+ # define some regular expressions to be used for first_html_par
+
+ # H1, H2, etc
+ HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
+ # A paragraph
+ PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+ # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
+ # to mark actual text
+ AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
+
+ # At worst, we can try stuff which is comprised between two <br>
+ AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
+ end
+ end
+ end
+end
module ::Irc
@@ -411,16 +442,6 @@ module ::Irc
end
end
- HX_REGEX = /<h(\d)(?:\s+[^>]*)?>(.*?)<\/h\1>/im
- PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
- # Some blogging and forum platforms use spans or divs with a 'body' or 'message' or 'text' in their class
- # to mark actual text
- AFTER_PAR1_REGEX = /<\w+\s+[^>]*(?:body|message|text)[^>]*>.*?<\/?(?:p|div|html|body|table|td|tr)(?:\s+[^>]*)?>/im
-
- # At worst, we can try stuff which is comprised between two <br>
- AFTER_PAR2_REGEX = /<br(?:\s+[^>]*)?\/?>.*?<\/?(?:br|p|div|html|body|table|td|tr)(?:\s+[^>]*)?\/?>/im
-
# Try to grab and IRCify the first HTML par (<p> tag) in the given string.
# If possible, grab the one after the first heading
#
@@ -431,6 +452,108 @@ module ::Irc
# * :min_spaces => Minimum number of spaces a paragraph should have
#
def Utils.ircify_first_html_par(xml_org, opts={})
+ if defined? ::Hpricot
+ Utils.ircify_first_html_par_wh(xml_org, opts)
+ else
+ Utils.ircify_first_html_par_woh(xml_org, opts)
+ end
+ end
+
+ # with hpricot
+ def Utils.ircify_first_html_par_wh(xml_org, opts={})
+ doc = Hpricot(xml_org)
+
+ # Strip styles and scripts
+ (doc/"style|script").remove
+
+ debug doc.inspect
+
+ strip = opts[:strip]
+ strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
+
+ min_spaces = opts[:min_spaces] || 8
+ min_spaces = 0 if min_spaces < 0
+
+ txt = String.new
+
+ h = %w{h1 h2 h3 h4 h5 h6}
+ p = %w{p}
+ ar = []
+ h.each { |hx|
+ p.each { |px|
+ ar << "#{hx}~#{px}"
+ }
+ }
+ h_p_css = ar.join("|")
+ debug "css search: #{h_p_css}"
+
+ while true
+ debug "Minimum number of spaces: #{min_spaces}"
+
+ # Initial attempt: <p> that follows <h\d>
+ pre_h = doc/h_p_css
+ debug "Hx: found: #{pre_h.pretty_inspect}"
+ pre_h.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(Hx attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Second natural attempt: just get any <p>
+ pars = doc/"p"
+ debug "par: found: #{pars.pretty_inspect}"
+ pars.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(par attempt) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # Nothing yet ... let's get drastic: we look for non-par elements too,
+ # but only for those that match something that we know is likely to
+ # contain text
+
+ # Some blogging and forum platforms use spans or divs with a 'body' or
+ # 'message' or 'text' in their class to mark actual text. Since we want
+ # the class match to be partial and case insensitive, we collect
+ # the common elements that may have this class and then filter out those
+ # we don't need
+ pars = Hpricot::Elements[]
+ pre_pars = doc/"div|span|td|tr|tbody|table"
+ pre_pars.each { |el|
+ pars.push el if el.class =~ /body|message|text/i
+ }
+ debug "other \#1: found: #{pars.pretty_inspect}"
+
+ pars.each { |p|
+ debug p
+ txt = p.to_html.ircify_html
+ txt.sub!(strip, '') if strip
+ debug "(other attempt \#1) #{txt.inspect} has #{txt.count(" ")} spaces"
+ break unless txt.empty? or txt.count(" ") < min_spaces
+ }
+
+ return txt unless txt.empty? or txt.count(" ") < min_spaces
+
+ # At worst, we can try stuff which is comprised between two <br>
+ # TODO
+
+ debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
+ return txt unless txt.count(" ") < min_spaces
+ break if min_spaces == 0
+ min_spaces /= 2
+ end
+ end
+
+ # without hpricot
+ def Utils.ircify_first_html_par_woh(xml_org, opts={})
xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
strip = opts[:strip]
@@ -502,6 +625,7 @@ module ::Irc
debug "Last candidate #{txt.inspect} has #{txt.count(" ")} spaces"
return txt unless txt.count(" ") < min_spaces
+ break if min_spaces == 0
min_spaces /= 2
end
end