From 5270da00bb7974629a1c0697c0296dbd7b7c992b Mon Sep 17 00:00:00 2001
From: Giuseppe Bilotta
Date: Thu, 27 Aug 2009 21:35:06 +0200
Subject: url plugin: only chop non-word characters on 404

Chopping everything causes long delays for non-existing pages with long
paths. Since the purpose of the retry-with-chop is to get the right URL
when punctuation is added after it, the solution is to only chop
non-word characters.

This has to be done on the unescaped URL because otherwise non-word
characters like " that expand to %22 will not be chopped.
---
 data/rbot/plugins/url.rb | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index ad895121..56e461d6 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -169,9 +169,17 @@ class UrlPlugin < Plugin
           # with the last character stripped. this might generate invalid URIs
           # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
           if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
-            # chop off last character, and retry if we still have enough string to
-            # look like a minimal URL
-            retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
+            # chop off last non-word character from the unescaped version of
+            # the URL, and retry if we still have enough string to look like a
+            # minimal URL
+            unescaped = URI.unescape(urlstr)
+            debug "Unescaped: #{unescaped}"
+            if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
+              urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
+              retry
+            else
+              debug "Not retrying #{unescaped}"
+            end
           end
           reply = "Error #{e.message}"
         end
-- 
cgit v1.2.3