From 5270da00bb7974629a1c0697c0296dbd7b7c992b Mon Sep 17 00:00:00 2001
From: Giuseppe Bilotta <giuseppe.bilotta@gmail.com>
Date: Thu, 27 Aug 2009 21:35:06 +0200
Subject: url plugin: only chop non-word characters on 404

Chopping any trailing character, one per retry, causes long delays
for non-existing pages with long paths, since every chopped character
leads to another retry. Since the purpose of the retry-with-chop is to
get the right URL when punctuation is added after it, the solution is
to only chop trailing non-word characters. This has to be done on the
unescaped URL because otherwise a non-word character like ", which is
escaped as %22, would not be chopped: the escaped URL ends in the word
character 2.
---
 data/rbot/plugins/url.rb | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'data/rbot/plugins')

diff --git a/data/rbot/plugins/url.rb b/data/rbot/plugins/url.rb
index ad895121..56e461d6 100644
--- a/data/rbot/plugins/url.rb
+++ b/data/rbot/plugins/url.rb
@@ -169,9 +169,17 @@ class UrlPlugin < Plugin
         # with the last character stripped. this might generate invalid URIs
         # (e.g. because "some.url" gets chopped to some.url%2, so catch that too
         if e.message =~ /\(404 - Not Found\)/i or e.kind_of?(URI::InvalidURIError)
-          # chop off last character, and retry if we still have enough string to
-          # look like a minimal URL
-          retry if urlstr.chop! and urlstr =~ /^https?:\/\/./
+          # chop off last non-word character from the unescaped version of
+          # the URL, and retry if we still have enough string to look like a
+          # minimal URL
+          unescaped = URI.unescape(urlstr)
+          debug "Unescaped: #{unescaped}"
+          if unescaped.sub!(/\W$/,'') and unescaped =~ /^https?:\/\/./
+            urlstr.replace URI.escape(unescaped, OUR_UNSAFE)
+            retry
+          else
+            debug "Not retrying #{unescaped}"
+          end
         end
         reply = "Error #{e.message}"
       end
-- 
cgit v1.2.3