summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-03-25 18:16:36 +0000
committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-03-25 18:16:36 +0000
commit c513b0227a88b441500581cff9e7f3f954830d2e (patch)
tree 9db776532721aa7f9d5a1f1927a6a2ab764e744d
parent bc9e991b8665fdd8f77a257c5381cf70d015a6ec (diff)
Utils: when looking for the first par in a web page, look after any header, not just h1; also, be stricter on what's included in a paragraph
-rw-r--r-- lib/rbot/core/utils/utils.rb 8
1 file changed, 4 insertions, 4 deletions
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index bd35d8d0..cf16b601 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -431,10 +431,10 @@ module ::Irc
end
end
- H1_REGEX = /<h1(?:\s+[^>]*)?>(.*?)<\/h1>/im
- PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
+ HX_REGEX = /<h(\d)(?:\s+[^>]*)?>.*?<\/h\1>/im
+ PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/?(?:p|(?:div|html|body|table|td|tr)(?:\s+[^>]*)?)>/im
# Try to grab and IRCify the first HTML par (<p> tag) in the given string.
- # If possible, grab the one after the first h1 heading
+ # If possible, grab the one after the first heading
#
# It is possible to pass some options to determine how the stripping
# occurs. Currently, only one option is supported:
@@ -446,7 +446,7 @@ module ::Irc
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
- header_found = xml.match(H1_REGEX)
+ header_found = xml.match(HX_REGEX)
if header_found
header_found = $'
debug "Found header: #{header_found[1].inspect}"