summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-02-06 17:36:43 +0000
committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-02-06 17:36:43 +0000
commit bf8325c9a59667474940566065590b8da3dec85d (patch)
tree adaf23c4299f882002cfa1790c59a622793bba60 /lib
parent f5abcb7eff07f436904ad5d88c0651e197c5914c (diff)
Enhance Utils tools to get first pars, allowing an option to strip an initial part of the paragraphs extracted
Diffstat (limited to 'lib')
-rw-r--r-- lib/rbot/core/utils/utils.rb 23
1 files changed, 17 insertions, 6 deletions
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index f5a6c1db..4613dada 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -420,18 +420,30 @@ module ::Irc
PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
# Try to grab and IRCify the first HTML par (<p> tag) in the given string.
# If possible, grab the one after the first h1 heading
- def Utils.ircify_first_html_par(xml)
- header_found = xml.match(H1_REGEX)
+ #
+ # It is possible to pass some options to determine how the stripping
+ # occurs. Currently, only one option is supported:
+ # * :strip => Regex or String to strip at the beginning of the obtained
+ # text
+ #
+ def Utils.ircify_first_html_par(xml, opts={})
txt = String.new
+ strip = opts[:strip]
+ strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
+
+ header_found = xml.match(H1_REGEX)
if header_found
+ header_found = $'
debug "Found header: #{header_found[1].inspect}"
while txt.empty?
- header_found = $'
candidate = header_found[PAR_REGEX]
break unless candidate
txt = candidate.ircify_html
+ header_found = $'
+ txt.sub!(strip, '') if strip
end
end
+
# If we haven't found a first par yet, try to get it from the whole
# document
if txt.empty?
@@ -441,6 +453,7 @@ module ::Irc
break unless candidate
txt = candidate.ircify_html
header_found = $'
+ txt.sub!(strip, '') if strip
end
end
return txt
@@ -464,9 +477,7 @@ module ::Irc
debug "Unable to retrieve #{url}"
next
end
- debug "Retrieved #{url}"
- debug "\t#{xml}"
- par = Utils.ircify_first_html_par(xml)
+ par = Utils.ircify_first_html_par(xml, opts)
if par.empty?
debug "No first par found\n#{xml}"
# FIXME only do this if the 'url' plugin is loaded