summary refs log tree commit diff
diff options
context:
space:
mode:
author Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-04-01 16:46:05 +0000
committer Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-04-01 16:46:05 +0000
commit 8d51c4a1a5a75e8e660f85cce37efcdf993500af (patch)
tree 8686a777529508c92fdcc16bde45b6daf5d2fe94
parent 84e53ad77fae1fd7e924986c5f36a04115e13ffc (diff)
String#utfy_xml() method that tries to transcode a webpage to UTF-8; HTTP headers are attached to bodies returned by HttpUtil methods to ease charset detection
-rw-r--r-- lib/rbot/core/utils/extends.rb 43
-rw-r--r-- lib/rbot/core/utils/httputil.rb 16
-rw-r--r-- lib/rbot/core/utils/utils.rb 2
3 files changed, 60 insertions, 1 deletion
diff --git a/lib/rbot/core/utils/extends.rb b/lib/rbot/core/utils/extends.rb
index 0ecf7aa2..95569b71 100644
--- a/lib/rbot/core/utils/extends.rb
+++ b/lib/rbot/core/utils/extends.rb
@@ -27,6 +27,12 @@ class ::Array
end
end
+begin
+ require 'iconv'
+ $we_have_iconv = true
+rescue LoadError
+ $we_have_iconv = false
+end
# Extensions to the String class
#
@@ -35,6 +41,43 @@ end
#
class ::String
+ # This method will try to transcode a String supposed to hold an XML or HTML
+ # document from the original charset to UTF-8.
+ #
+ # To find the original encoding, it will first see if the String responds to
+ # #http_headers(), and if it does it will assume that the charset indicated
+ # there is the correct one. Otherwise, it will try to detect the charset from
+ # some typical XML and HTML headers
+ def utfy_xml
+ return self unless $we_have_iconv
+
+ charset = nil
+
+ if self.respond_to?(:http_headers) and headers = self.http_headers
+ if headers['content-type'].first.match(/charset="?(\S+?)"?\s*;?/i)
+ charset = $1
+ end
+ end
+
+ if not charset
+ case self
+ when /<\?xml.*encoding="(\S+)".*\?>/i
+ charset = $1
+ when /<meta\s+http-equiv\s*=\s*"Content-Type".*charset\s*=\s*"?(\S+?)"?\s*;?/i
+ charset = $1
+ end
+ end
+
+ if charset
+ debug "charset: #{charset}"
+ return Iconv.iconv('utf-8', charset, self).join rescue self
+ else
+ debug "Couldn't find charset for #{self.inspect}"
+ return self
+ end
+
+ end
+
# This method will return a purified version of the receiver, with all HTML
# stripped off and some of it converted to IRC formatting
#
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index 78445abe..78ea9063 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -301,6 +301,22 @@ class HttpUtil
resp.body
end
+ class << resp.body
+ def http_headers
+ if defined?(@http_headers)
+ @http_headers
+ else
+ nil
+ end
+ end
+
+ def http_headers=(rsp)
+ @http_headers=rsp
+ end
+ end
+
+ resp.body.http_headers = resp.to_hash
+
return resp
end
diff --git a/lib/rbot/core/utils/utils.rb b/lib/rbot/core/utils/utils.rb
index 717630e3..63cd58da 100644
--- a/lib/rbot/core/utils/utils.rb
+++ b/lib/rbot/core/utils/utils.rb
@@ -433,7 +433,7 @@ module ::Irc
# * :min_spaces => Minimum number of spaces a paragraph should have
#
def Utils.ircify_first_html_par(xml_org, opts={})
- xml = xml_org.gsub(/<!--.*?-->/, '')
+ xml = xml_org.gsub(/<!--.*?-->/, '').utfy_xml
strip = opts[:strip]
strip = Regexp.new(/^#{Regexp.escape(strip)}/) if strip.kind_of?(String)