summaryrefslogtreecommitdiff
path: root/lib/rbot
diff options
context:
space:
mode:
author Dmitry Kim <dmitry point kim at gmail point com> 2007-04-02 12:48:50 +0000
committer Dmitry Kim <dmitry point kim at gmail point com> 2007-04-02 12:48:50 +0000
commit 4258907bfce64f40ca384b6532e47f30615da15b (patch)
tree 93202531bc22f18dce6546b9e4b861c69dfa0c3f /lib/rbot
parent b14aff9aee87df83c2aab47c92b58cc7bf74432e (diff)
* (httputil) transparent charset support in HTTP::Response
- (extends) encoding functionality moved into httputil
Diffstat (limited to 'lib/rbot')
-rw-r--r-- lib/rbot/core/utils/extends.rb | 45
-rw-r--r-- lib/rbot/core/utils/httputil.rb | 87
-rw-r--r-- lib/rbot/core/utils/utils.rb | 2
3 files changed, 64 insertions, 70 deletions
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index fa2dff95..bec6e94e 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -27,13 +27,6 @@ class ::Array
end
end
-begin
- require 'iconv'
- $we_have_iconv = true
-rescue LoadError
- $we_have_iconv = false
-end
-
# Extensions to the String class
#
# TODO make ircify_html() accept an Hash of options, and make riphtml() just
@@ -41,44 +34,6 @@ end
#
class ::String
- # This method will try to transcode a String supposed to hold an XML or HTML
- # document from the original charset to UTF-8.
- #
- # To find the original encoding, it will first see if the String responds to
- # #http_headers(), and if it does it will assume that the charset indicated
- # there is the correct one. Otherwise, it will try to detect the charset from
- # some typical XML and HTML headers
- def utfy_xml
- return self unless $we_have_iconv
-
- charset = nil
-
- if self.respond_to?(:http_headers) and headers = self.http_headers
- if headers['content-type'].first.match(/charset=(\S+?)\s*(?:;|\Z)/i)
- debug "charset #{charset} set from header"
- charset = $1
- end
- end
-
- if not charset
- case self
- when /<\?xml.*encoding="(\S+)".*\?>/i
- charset = $1
- when /<meta\s+http-equiv\s*=\s*["']?Content-Type["']?.*charset\s*=\s*(\S+?)(?:;|["']|\s).*>/i
- charset = $1
- end
- debug "charset #{charset} set from string"
- end
-
- if charset
- return Iconv.iconv('utf-8', charset, self).join rescue self
- else
- debug "Couldn't find charset for #{self.inspect}"
- return self
- end
-
- end
-
# This method will return a purified version of the receiver, with all HTML
# stripped off and some of it converted to IRC formatting
#
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index 78ea9063..f0a09364 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -13,6 +13,7 @@
require 'resolv'
require 'net/http'
+require 'iconv'
begin
require 'net/https'
rescue LoadError => e
@@ -22,19 +23,65 @@ end
module ::Net
class HTTPResponse
+ attr_accessor :no_cache
+ # Keep the untranscoded body reachable as #raw_body, aliasing only once.
+ # method_defined? is used instead of instance_methods.include?('raw_body')
+ # because instance_methods lists Strings on Ruby 1.8 but Symbols on 1.9+;
+ # the String comparison would never match there, so loading this file a
+ # second time would re-alias raw_body to the overridden #body.
+ unless method_defined?(:raw_body)
+   alias :raw_body :body
+ end
+
+ # Work out which charset the response body is encoded in.
+ #
+ # Returns nil when the Content-Type (taken to be 'text/html' if the header
+ # is missing) is neither text/* nor an XML/XHTML type. Otherwise the result
+ # starts as 'latin1', is overridden by a charset= parameter in the
+ # Content-Type value, and then by an encoding declared inside _str_: either
+ # an <?xml ... encoding="..."?> processing instruction or a
+ # <meta http-equiv="Content-Type" ...> tag. _str_ defaults to the raw body.
+ def body_charset(str=self.raw_body)
+   ctype = self['content-type'] || 'text/html'
+   return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i
+
+   charset = 'latin1' # should be in config
+
+   # Match against ctype, not self['content-type']: when the header is absent
+   # the raw lookup is nil and calling #match on it raises NoMethodError.
+   if ctype.match(/charset=["']?([^\s"']+)["']?/i)
+     charset = $1
+     debug "charset #{charset} set from header"
+   end
+
+   # A declaration inside the document takes precedence over the header.
+   case str
+   when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
+     charset = $1
+     debug "xml charset #{charset} set from xml pi"
+   when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i
+     meta = $1
+     if meta =~ /charset=['"]?([^\s'";]+)['"]?/
+       charset = $1
+       debug "html charset #{charset} set from meta"
+     end
+   end
+   return charset
+ end
+
+ # Transcode _str_ to UTF-8 using the charset reported by #body_charset.
+ #
+ # _str_ is returned unchanged when no charset is reported or when the
+ # Iconv conversion raises.
+ def body_to_utf(str)
+   source_charset = self.body_charset(str)
+   return str unless source_charset
+
+   begin
+     converted = Iconv.iconv('utf-8//ignore', source_charset, str)
+     converted.first
+   rescue
+     debug "conversion failed"
+     str
+   end
+ end
+
+ # The raw response body passed through #body_to_utf.
+ def body
+   body_to_utf(raw_body)
+ end
+
# Read chunks from the body until we have at least _size_ bytes, yielding
# the partial text at each chunk. Return the partial body.
def partial_body(size=0, &block)
+ self.no_cache = true
partial = String.new
self.read_body { |chunk|
partial << chunk
- yield partial if block_given?
+ yield self.body_to_utf(partial) if block_given?
break if size and size > 0 and partial.length >= size
}
- return partial
+ return self.body_to_utf(partial)
end
end
end
@@ -85,6 +132,7 @@ class HttpUtil
def self.maybe_new(resp)
debug "maybe new #{resp}"
+ return nil if resp.no_cache
return nil unless Net::HTTPOK === resp ||
Net::HTTPMovedPermanently === resp ||
Net::HTTPFound === resp ||
@@ -160,7 +208,7 @@ class HttpUtil
@response = resp
begin
self.revalidate
- self.response.body
+ self.response.raw_body
rescue Exception => e
error e.message
error e.backtrace.join("\n")
@@ -298,25 +346,9 @@ class HttpUtil
if block_given?
yield(resp)
else
- resp.body
+ # Net::HTTP wants us to read the whole body here
+ resp.raw_body
end
-
- class << resp.body
- def http_headers
- if defined?(@http_headers)
- @http_headers
- else
- nil
- end
- end
-
- def http_headers=(rsp)
- @http_headers=rsp
- end
- end
-
- resp.body.http_headers = resp.to_hash
-
return resp
end
@@ -417,9 +449,16 @@ class HttpUtil
elsif Net::HTTPServerError === resp || Net::HTTPClientError === resp
debug "http error, deleting cached obj" if cached
@cache.delete(cache_key)
- elsif opts[:cache] && cached = CachedObject.maybe_new(resp) rescue nil
- debug "storing to cache"
- @cache[cache_key] = cached
+ elsif opts[:cache]
+ begin
+ return handle_response(uri, resp, opts, &block)
+ ensure
+ if cached = CachedObject.maybe_new(resp) rescue nil
+ debug "storing to cache"
+ @cache[cache_key] = cached
+ end
+ end
+ return ret
end
return handle_response(uri, resp, opts, &block)
end
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 57f6a934..cd2b9e1a 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -433,7 +433,7 @@ module ::Irc
# * :min_spaces => Minimum number of spaces a paragraph should have
#
def Utils.ircify_first_html_par(xml_org, opts={})
- xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "").utfy_xml
+ xml = xml_org.gsub(/<!--.*?-->/m, '').gsub(/<script(?:\s+[^>]*)?>.*?<\/script>/im, "").gsub(/<style(?:\s+[^>]*)?>.*?<\/style>/im, "")
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)