diff options
author | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-02-06 17:36:43 +0000 |
---|---|---|
committer | Giuseppe Bilotta <giuseppe.bilotta@gmail.com> | 2007-02-06 17:36:43 +0000 |
commit | bf8325c9a59667474940566065590b8da3dec85d (patch) | |
tree | adaf23c4299f882002cfa1790c59a622793bba60 /lib | |
parent | f5abcb7eff07f436904ad5d88c0651e197c5914c (diff) |
Enhance Utils tools to get first pars, allowing an option to strip an initial part of the paragraphs extracted
Diffstat (limited to 'lib')
-rw-r--r-- | lib/rbot/core/utils/utils.rb | 23 |
1 files changed, 17 insertions, 6 deletions
```diff
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index f5a6c1db..4613dada 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -420,18 +420,30 @@ module ::Irc
     PAR_REGEX = /<p(?:\s+[^>]*)?>.*?<\/p>/im
     # Try to grab and IRCify the first HTML par (<p> tag) in the given string.
     # If possible, grab the one after the first h1 heading
-    def Utils.ircify_first_html_par(xml)
-      header_found = xml.match(H1_REGEX)
+    #
+    # It is possible to pass some options to determine how the stripping
+    # occurs. Currently, only one option is supported:
+    # * :strip => Regex or String to strip at the beginning of the obtained
+    #   text
+    #
+    def Utils.ircify_first_html_par(xml, opts={})
       txt = String.new
+      strip = opts[:strip]
+      strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
+
+      header_found = xml.match(H1_REGEX)
       if header_found
+        header_found = $'
         debug "Found header: #{header_found[1].inspect}"
         while txt.empty?
-          header_found = $'
           candidate = header_found[PAR_REGEX]
           break unless candidate
           txt = candidate.ircify_html
+          header_found = $'
+          txt.sub!(strip, '') if strip
         end
       end
+
       # If we haven't found a first par yet, try to get it from the whole
       # document
       if txt.empty?
@@ -441,6 +453,7 @@ module ::Irc
           break unless candidate
           txt = candidate.ircify_html
           header_found = $'
+          txt.sub!(strip, '') if strip
         end
       end
       return txt
@@ -464,9 +477,7 @@ module ::Irc
           debug "Unable to retrieve #{url}"
           next
         end
-        debug "Retrieved #{url}"
-        debug "\t#{xml}"
-        par = Utils.ircify_first_html_par(xml, opts)
+        par = Utils.ircify_first_html_par(xml, opts)
         if par.empty?
           debug "No first par found\n#{xml}"
           # FIXME only do this if the 'url' plugin is loaded
```