summary | refs | log | tree | commit | diff
path: root/lib/rbot
diff options
context:
space:
mode:
author: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-04-05 20:24:23 +0000
committer: Giuseppe Bilotta <giuseppe.bilotta@gmail.com> 2007-04-05 20:24:23 +0000
commit: 4e3660831d7f4fbfe58341e9ce95bef620f13d6b (patch)
tree: f4565647c778c5cf2d779ebe10bdb8c6aabe7340 /lib/rbot
parent: 56889ab64a8b60dba2a4aaaf876386841213b14a (diff)
HttpUtil: try all detected charsets when converting a webpage, until one that works is found
Diffstat (limited to 'lib/rbot')
-rw-r--r-- lib/rbot/core/utils/httputil.rb 32
1 file changed, 17 insertions, 15 deletions
diff --git a/lib/rbot/core/utils/httputil.rb b/lib/rbot/core/utils/httputil.rb
index f0a09364..476a71c1 100644
--- a/lib/rbot/core/utils/httputil.rb
+++ b/lib/rbot/core/utils/httputil.rb
@@ -32,36 +32,38 @@ module ::Net
ctype = self['content-type'] || 'text/html'
return nil unless ctype =~ /^text/i || ctype =~ /x(ht)?ml/i
- charset = 'latin1' # should be in config
+ charsets = ['latin1'] # should be in config
if self['content-type'].match(/charset=["']?([^\s"']+)["']?/i)
- charset = $1
- debug "charset #{charset} set from header"
+ charsets << $1
+ debug "charset #{charsets.last} added from header"
end
case str
when /<\?xml\s[^>]*encoding=['"]([^\s"'>]+)["'][^>]*\?>/i
- charset = $1
- debug "xml charset #{charset} set from xml pi"
+ charsets << $1
+ debug "xml charset #{charsets.last} added from xml pi"
when /<(meta\s[^>]*http-equiv=["']?Content-Type["']?[^>]*)>/i
meta = $1
if meta =~ /charset=['"]?([^\s'";]+)['"]?/
- charset = $1
- debug "html charset #{charset} set from meta"
+ charsets << $1
+ debug "html charset #{charsets.last} added from meta"
end
end
- return charset
+ return charsets.uniq
end
def body_to_utf(str)
- charset = self.body_charset(str) or return str
+ charsets = self.body_charset(str) or return str
- begin
- return Iconv.iconv('utf-8//ignore', charset, str).first
- rescue
- debug "conversion failed"
- return str
- end
+ charsets.reverse_each { |charset|
+ begin
+ return Iconv.iconv('utf-8//ignore', charset, str).first
+ rescue
+ debug "conversion failed for #{charset}"
+ end
+ }
+ return str
end
def body