From 8d51c4a1a5a75e8e660f85cce37efcdf993500af Mon Sep 17 00:00:00 2001 From: Giuseppe Bilotta Date: Sun, 1 Apr 2007 16:46:05 +0000 Subject: String#utfy_xml() method that tries to transcode a webpage to UTF-8; HTTP headers are attached to bodies returned by HttpUtil methods to ease charset detection --- lib/rbot/core/utils/extends.rb | 43 +++++++++++++++++++++++++++++++++++++++++ lib/rbot/core/utils/httputil.rb | 16 +++++++++++++++ lib/rbot/core/utils/utils.rb | 2 +- 3 files changed, 60 insertions(+), 1 deletion(-) (limited to 'lib/rbot') diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb index 0ecf7aa2..95569b71 100644 --- a/lib/rbot/core/utils/extends.rb +++ b/lib/rbot/core/utils/extends.rb @@ -27,6 +27,12 @@ class ::Array end end +begin + require 'iconv' + $we_have_iconv = true +rescue LoadError + $we_have_iconv = false +end # Extensions to the String class # @@ -35,6 +41,43 @@ end # class ::String + # This method will try to transcode a String supposed to hold an XML or HTML + # document from the original charset to UTF-8. + # + # To find the original encoding, it will first see if the String responds to + # #http_headers(), and if it does it will assume that the charset indicated + # there is the correct one. Otherwise, it will try to detect the charset from + # some typical XML and HTML headers + def utfy_xml + return self unless $we_have_iconv + + charset = nil + + if self.respond_to?(:http_headers) and headers = self.http_headers + if headers['content-type'].first.match(/charset="?(\S+?)"?\s*;?/i) + charset = $1 + end + end + + if not charset + case self + when /<\?xml.*encoding="(\S+)".*\?>/i + charset = $1 + when / Minimum number of spaces a paragraph should have # def Utils.ircify_first_html_par(xml_org, opts={}) - xml = xml_org.gsub(//, '') + xml = xml_org.gsub(//, '').utfy_xml strip = opts[:strip] strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String) -- cgit v1.2.3