* (httputil) transparent charset support in Net::HTTPResponse: #body and #partial_body now transcode text responses to UTF-8

- (extends) charset detection and transcoding (String#utfy_xml and the iconv loading) moved into httputil
author: Dmitry Kim <dmitry point kim at gmail point com> 2007-04-02 12:48:50 +0000
committer: Dmitry Kim <dmitry point kim at gmail point com> 2007-04-02 12:48:50 +0000
commit: 4258907bfce64f40ca384b6532e47f30615da15b (patch)
tree: 93202531bc22f18dce6546b9e4b861c69dfa0c3f /lib/rbot
parent: b14aff9aee87df83c2aab47c92b58cc7bf74432e (diff)
3 files changed, 64 insertions, 70 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index fa2dff95..bec6e94e 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -27,13 +27,6 @@ class ::Array
   end
 end
 
-begin
-  require 'iconv'
-  $we_have_iconv = true
-rescue LoadError
-  $we_have_iconv = false
-end
-
 # Extensions to the String class
 #
 # TODO make ircify_html() accept an Hash of options, and make riphtml() just
@@ -41,44 +34,6 @@ end
 #
 class ::String
 
-  # This method will try to transcode a String supposed to hold an XML or HTML
-  # document from the original charset to UTF-8.
-  #
-  # To find the original encoding, it will first see if the String responds to
-  # #http_headers(), and if it does it will assume that the charset indicated
-  # there is the correct one. Otherwise, it will try to detect the charset from
-  # some typical XML and HTML headers
-  def utfy_xml
-    return self unless $we_have_iconv
-
-    charset = nil
-
-    if self.respond_to?(:http_headers) and headers = self.http_headers
-      if headers['content-type'].first.match(/charset=(\S+?)\s*(?:;|\Z)/i)
-        debug "charset #{charset} set from header"
-        charset = $1
-      end
-    end
-
-    if not charset
-      case self
-      when /<\?xml.*encoding="(\S+)".*\?>/i
-        charset = $1
-      when /<meta\s+http-equiv\s*=\s*["']?Content-Type["']?.*charset\s*=\s*(\S+?)(?:;|["']|\s).*>/i
-        charset = $1
-      end
-      debug "charset #{charset} set from string"
-    end
-
-    if charset
-      return Iconv.iconv('utf-8', charset, self).join rescue self
-    else
-      debug "Couldn't find charset for #{self.inspect}"
-      return self
-    end
-
-  end
-
   # This method will return a purified version of the receiver, with all HTML
   # stripped off and some of it converted to IRC formatting
   #
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index 78ea9063..f0a09364 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -13,6 +13,7 @@
 
 require 'resolv'
 require 'net/http'
+require 'iconv'
 begin
   require 'net/https'
 rescue LoadError => e
@@ -22,19 +23,65 @@ end
 
 module ::Net 
   class HTTPResponse 
+    attr_accessor :no_cache 
+    if !instance_methods.include?('raw_body')
+      alias :raw_body :body
+    end
+
+    def body_charset(str=self.raw_body)
+      ctype = self['content-type'] || 'text/html'
+      return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i
+
+      charset = 'latin1' # should be in config
+
+      if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i)
+        charset = $1
+        debug "charset #{charset} set from header"
+      end
+
+      case str
+      when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
+        charset = $1
+        debug "xml charset #{charset} set from xml pi"
+      when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i
+        meta = $1
+        if meta =~ /charset=['"]?([^\s'";]+)['"]?/
+          charset = $1
+          debug "html charset #{charset} set from meta"
+        end
+      end
+      return charset
+    end
+
+    def body_to_utf(str)
+      charset = self.body_charset(str) or return str
+
+      begin
+        return Iconv.iconv('utf-8//ignore', charset, str).first
+      rescue
+        debug "conversion failed"
+        return str
+      end
+    end
+
+    def body
+      return self.body_to_utf(self.raw_body)
+    end
+
     # Read chunks from the body until we have at least _size_ bytes, yielding 
     # the partial text at each chunk. Return the partial body. 
     def partial_body(size=0, &block) 
 
+      self.no_cache = true
       partial = String.new 
 
       self.read_body { |chunk| 
         partial << chunk 
-        yield partial if block_given? 
+        yield self.body_to_utf(partial) if block_given? 
         break if size and size > 0 and partial.length >= size 
       } 
 
-      return partial 
+      return self.body_to_utf(partial)
     end 
   end 
 end
@@ -85,6 +132,7 @@ class HttpUtil
 
     def self.maybe_new(resp)
       debug "maybe new #{resp}"
+      return nil if resp.no_cache
       return nil unless Net::HTTPOK === resp ||
       Net::HTTPMovedPermanently === resp ||
       Net::HTTPFound === resp ||
@@ -160,7 +208,7 @@ class HttpUtil
       @response = resp
       begin
         self.revalidate
-        self.response.body
+        self.response.raw_body
       rescue Exception => e
         error e.message
         error e.backtrace.join("\n")
@@ -298,25 +346,9 @@ class HttpUtil
     if block_given?
       yield(resp)
     else
-      resp.body
+      # Net::HTTP wants us to read the whole body here
+      resp.raw_body
     end
-
-    class << resp.body
-      def http_headers
-        if defined?(@http_headers)
-          @http_headers
-        else
-          nil
-        end
-      end
-
-      def http_headers=(rsp)
-        @http_headers=rsp
-      end
-    end
-
-    resp.body.http_headers = resp.to_hash
-
     return resp
   end
 
@@ -417,9 +449,16 @@ class HttpUtil
         elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp
           debug "http error, deleting cached obj" if cached
           @cache.delete(cache_key)
-        elsif opts[:cache] && cached = CachedObject.maybe_new(resp) rescue nil
-          debug "storing to cache"
-          @cache[cache_key] = cached
+        elsif opts[:cache]
+          begin
+            return handle_response(uri, resp, opts, &block)
+          ensure
+            if cached = CachedObject.maybe_new(resp) rescue nil
+              debug "storing to cache"
+              @cache[cache_key] = cached
+            end
+          end
+          return ret
         end
         return handle_response(uri, resp, opts, &block)
       end
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 57f6a934..cd2b9e1a 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -433,7 +433,7 @@ module ::Irc
     #   * :min_spaces => Minimum number of spaces a paragraph should have
     #
     def Utils.ircify_first_html_par(xml_org, opts={})
-      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "").utfy_xml
+      xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
 
       strip = opts[:strip]
       strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)
author	Dmitry Kim <dmitry point kim at gmail point com>	2007-04-02 12:48:50 +0000
committer	Dmitry Kim <dmitry point kim at gmail point com>	2007-04-02 12:48:50 +0000
commit	4258907bfce64f40ca384b6532e47f30615da15b (patch)
tree	93202531bc22f18dce6546b9e4b861c69dfa0c3f /lib/rbot
parent	b14aff9aee87df83c2aab47c92b58cc7bf74432e (diff)