diff options
author | Dmitry Kim <dmitry point kim at gmail point com> | 2007-04-02 12:48:50 +0000 |
---|---|---|
committer | Dmitry Kim <dmitry point kim at gmail point com> | 2007-04-02 12:48:50 +0000 |
commit | 4258907bfce64f40ca384b6532e47f30615da15b (patch) | |
tree | 93202531bc22f18dce6546b9e4b861c69dfa0c3f /lib/rbot | |
parent | b14aff9aee87df83c2aab47c92b58cc7bf74432e (diff) |
* (httputil) transparent charset support in HTTP::Response
- (extends) encoding functionality moved into httputil
Diffstat (limited to 'lib/rbot')
-rw-r--r-- | lib/rbot/core/utils/extends.rb | 45 | ||||
-rw-r--r-- | lib/rbot/core/utils/httputil.rb | 87 | ||||
-rw-r--r-- | lib/rbot/core/utils/utils.rb | 2 |
3 files changed, 64 insertions, 70 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb index fa2dff95..bec6e94e 100644 --- a/lib/rbot/core/utils/extends.rb +++ b/lib/rbot/core/utils/extends.rb @@ -27,13 +27,6 @@ class ::Array end end -begin - require 'iconv' - $we_have_iconv = true -rescue LoadError - $we_have_iconv = false -end - # Extensions to the String class # # TODO make ircify_html() accept an Hash of options, and make riphtml() just @@ -41,44 +34,6 @@ end # class ::String - # This method will try to transcode a String supposed to hold an XML or HTML - # document from the original charset to UTF-8. - # - # To find the original encoding, it will first see if the String responds to - # #http_headers(), and if it does it will assume that the charset indicated - # there is the correct one. Otherwise, it will try to detect the charset from - # some typical XML and HTML headers - def utfy_xml - return self unless $we_have_iconv - - charset = nil - - if self.respond_to?(:http_headers) and headers = self.http_headers - if headers['content-type'].first.match(/charset=(\S+?)\s*(?:;|\Z)/i) - debug "charset #{charset} set from header" - charset = $1 - end - end - - if not charset - case self - when /<\?xml.*encoding="(\S+)".*\?>/i - charset = $1 - when /<meta\s+http-equiv\s*=\s*["']?Content-Type["']?.*charset\s*=\s*(\S+?)(?:;|["']|\s).*>/i - charset = $1 - end - debug "charset #{charset} set from string" - end - - if charset - return Iconv.iconv('utf-8', charset, self).join rescue self - else - debug "Couldn't find charset for #{self.inspect}" - return self - end - - end - # This method will return a purified version of the receiver, with all HTML # stripped off and some of it converted to IRC formatting # diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb index 78ea9063..f0a09364 100644 --- a/lib/rbot/core/utils/httputil.rb +++ b/lib/rbot/core/utils/httputil.rb @@ -13,6 +13,7 @@ require 'resolv' require 'net/http' +require 'iconv' begin require 'net/https' rescue LoadError => e @@ -22,19 +23,65 @@ end module ::Net class HTTPResponse + attr_accessor :no_cache + if !instance_methods.include?('raw_body') + alias :raw_body :body + end + + def body_charset(str=self.raw_body) + ctype = self['content-type'] || 'text/html' + return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i + + charset = 'latin1' # should be in config + + if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i) + charset = $1 + debug "charset #{charset} set from header" + end + + case str + when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i + charset = $1 + debug "xml charset #{charset} set from xml pi" + when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i + meta = $1 + if meta =~ /charset=['"]?([^\s'";]+)['"]?/ + charset = $1 + debug "html charset #{charset} set from meta" + end + end + return charset + end + + def body_to_utf(str) + charset = self.body_charset(str) or return str + + begin + return Iconv.iconv('utf-8//ignore', charset, str).first + rescue + debug "conversion failed" + return str + end + end + + def body + return self.body_to_utf(self.raw_body) + end + # Read chunks from the body until we have at least _size_ bytes, yielding # the partial text at each chunk. Return the partial body. def partial_body(size=0, &block) + self.no_cache = true partial = String.new self.read_body { |chunk| partial << chunk - yield partial if block_given? + yield self.body_to_utf(partial) if block_given? break if size and size > 0 and partial.length >= size } - return partial + return self.body_to_utf(partial) end end end @@ -85,6 +132,7 @@ class HttpUtil def self.maybe_new(resp) debug "maybe new #{resp}" + return nil if resp.no_cache return nil unless Net::HTTPOK === resp || Net::HTTPMovedPermanently === resp || Net::HTTPFound === resp || @@ -160,7 +208,7 @@ class HttpUtil @response = resp begin self.revalidate - self.response.body + self.response.raw_body rescue Exception => e error e.message error e.backtrace.join("\n") @@ -298,25 +346,9 @@ class HttpUtil if block_given? yield(resp) else - resp.body + # Net::HTTP wants us to read the whole body here + resp.raw_body end - - class << resp.body - def http_headers - if defined?(@http_headers) - @http_headers - else - nil - end - end - - def http_headers=(rsp) - @http_headers=rsp - end - end - - resp.body.http_headers = resp.to_hash - return resp end @@ -417,9 +449,16 @@ class HttpUtil elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp debug "http error, deleting cached obj" if cached @cache.delete(cache_key) - elsif opts[:cache] && cached = CachedObject.maybe_new(resp) rescue nil - debug "storing to cache" - @cache[cache_key] = cached + elsif opts[:cache] + begin + return handle_response(uri, resp, opts, &block) + ensure + if cached = CachedObject.maybe_new(resp) rescue nil + debug "storing to cache" + @cache[cache_key] = cached + end + end + return ret end return handle_response(uri, resp, opts, &block) end diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb index 57f6a934..cd2b9e1a 100644 --- a/lib/rbot/core/utils/utils.rb +++ b/lib/rbot/core/utils/utils.rb @@ -433,7 +433,7 @@ module ::Irc # * :min_spaces => Minimum number of spaces a paragraph should have # def Utils.ircify_first_html_par(xml_org, opts={}) - xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "").utfy_xml + xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "") strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) |