summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAxel Rau <axel.rau@chaos1.de>2014-03-08 20:59:24 +0000
committerJeremy Harris <jgh146exb@wizmail.org>2014-03-08 21:01:55 +0000
commitb9c2e32fc0c80d2ff4afb7daa782d22c5c38a229 (patch)
treeda963e9fc7b912b2600a3f23e3c8d2411a7a630c
parentc27ab1df218daf0cdf9bfd0f474175c5f53f0d9c (diff)
${utf8clean:string} expansion operator. Bug 1401
-rw-r--r--doc/doc-docbook/spec.xfpt8
-rw-r--r--doc/doc-txt/ChangeLog2
-rw-r--r--doc/doc-txt/NewStuff3
-rw-r--r--src/src/expand.c92
-rw-r--r--test/confs/060069
-rw-r--r--test/log/060018
-rw-r--r--test/mail/0600.CALLER45
-rw-r--r--test/scripts/0000-Basic/060032
-rw-r--r--test/stdout/060012
9 files changed, 279 insertions, 2 deletions
diff --git a/doc/doc-docbook/spec.xfpt b/doc/doc-docbook/spec.xfpt
index 4faf78d90..e4cd6ed30 100644
--- a/doc/doc-docbook/spec.xfpt
+++ b/doc/doc-docbook/spec.xfpt
@@ -10171,6 +10171,14 @@ number of larger units and output in Exim's normal time format, for example,
.cindex "expansion" "case forcing"
.cindex "&%uc%& expansion item"
This forces the letters in the string into upper-case.
+
+.vitem &*${utf8clean*&<&'utf-8 string'&>&*}*&
+.cindex "correction of invalid utf-8 sequences in strings"
+.cindex "utf-8" "utf-8 sequences"
+.cindex "wrong utf-8"
+.cindex "expansion" "utf-8 forcing"
+.cindex "&%utf8clean%& expansion item"
+This replaces any invalid utf-8 sequence in the string by the character &`?`&.
.endlist
diff --git a/doc/doc-txt/ChangeLog b/doc/doc-txt/ChangeLog
index 1ca558e34..83c255c24 100644
--- a/doc/doc-txt/ChangeLog
+++ b/doc/doc-txt/ChangeLog
@@ -46,6 +46,8 @@ TL/05 Rename SPF condition results err_perm and err_temp to standardized
the ACL tests for either of these two results. Patch contributed by
user bes-internal on the mailing list.
+JH/04 Add ${utf8clean:} operator. Contributed by Alex Rau.
+
Exim version 4.82
-----------------
diff --git a/doc/doc-txt/NewStuff b/doc/doc-txt/NewStuff
index dd3e58714..04ac831dc 100644
--- a/doc/doc-txt/NewStuff
+++ b/doc/doc-txt/NewStuff
@@ -19,6 +19,9 @@ Version 4.83
those non-ASCII characters, but downstream apps may not, so Exim can
detect and reject if those characters are present.
+ 3. New expansion operator ${utf8clean:string} to replace malformed UTF8
+ codepoints with valid ones.
+
Version 4.82
------------
diff --git a/src/src/expand.c b/src/src/expand.c
index 7e1b32343..ee7b2fdc6 100644
--- a/src/src/expand.c
+++ b/src/src/expand.c
@@ -204,7 +204,8 @@ static uschar *op_table_main[] = {
US"str2b64",
US"strlen",
US"substr",
- US"uc" };
+ US"uc",
+ US"utf8clean" };
enum {
EOP_ADDRESS = sizeof(op_table_underscore)/sizeof(uschar *),
@@ -240,7 +241,8 @@ enum {
EOP_STR2B64,
EOP_STRLEN,
EOP_SUBSTR,
- EOP_UC };
+ EOP_UC,
+ EOP_UTF8CLEAN };
/* Table of condition names, and corresponding switch numbers. The names must
@@ -6206,6 +6208,89 @@ while (*s != 0)
continue;
}
+ /* replace illegal UTF-8 sequences by replacement character */
+
+ #define UTF8_REPLACEMENT_CHAR US"?"
+
+ case EOP_UTF8CLEAN:
+ {
+ int seq_len, index = 0;
+ int bytes_left = 0;
+ uschar seq_buff[4]; /* accumulate utf-8 here */
+
+ while (*sub != 0)
+ {
+ int complete;
+ long codepoint;
+ uschar c;
+
+ complete = 0;
+ c = *sub++;
+ if(bytes_left)
+ {
+ if ((c & 0xc0) != 0x80)
+ {
+ /* wrong continuation byte; invalidate all bytes */
+ complete = 1; /* error */
+ }
+ else
+ {
+ codepoint = (codepoint << 6) | (c & 0x3f);
+ seq_buff[index++] = c;
+ if (--bytes_left == 0) /* codepoint complete */
+ {
+ if(codepoint > 0x10FFFF) /* is it too large? */
+ complete = -1; /* error */
+ else
+ { /* finished; output utf-8 sequence */
+ yield = string_cat(yield, &size, &ptr, seq_buff, seq_len);
+ index = 0;
+ }
+ }
+ }
+ }
+ else /* no bytes left: new sequence */
+ {
+ if((c & 0x80) == 0) /* 1-byte sequence, US-ASCII, keep it */
+ {
+ yield = string_cat(yield, &size, &ptr, &c, 1);
+ continue;
+ }
+ if((c & 0xe0) == 0xc0) /* 2-byte sequence */
+ {
+ bytes_left = 1;
+ codepoint = c & 0x1f;
+ }
+ else if((c & 0xf0) == 0xe0) /* 3-byte sequence */
+ {
+ bytes_left = 2;
+ codepoint = c & 0x0f;
+ }
+ else if((c & 0xf8) == 0xf0) /* 4-byte sequence */
+ {
+ bytes_left = 3;
+ codepoint = c & 0x07;
+ }
+ else /* invalid or too long (RFC3629 allows only 4 bytes) */
+ complete = -1;
+
+ seq_buff[index++] = c;
+ seq_len = bytes_left + 1;
+ } /* if(bytes_left) */
+
+ if (complete != 0)
+ {
+ bytes_left = index = 0;
+ yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1);
+ }
+ if ((complete == 1) && ((c & 0x80) == 0))
+ { /* ASCII character follows incomplete sequence */
+ yield = string_cat(yield, &size, &ptr, &c, 1);
+ }
+ }
+ continue;
+ }
+
/* escape turns all non-printing characters into escape sequences. */
case EOP_ESCAPE:
@@ -6834,4 +6919,7 @@ return 0;
#endif
+/*
+ vi: aw ai sw=2
+*/
/* End of expand.c */
diff --git a/test/confs/0600 b/test/confs/0600
new file mode 100644
index 000000000..0347e4c60
--- /dev/null
+++ b/test/confs/0600
@@ -0,0 +1,69 @@
+# Exim test configuration 0005
+
+exim_path = EXIM_PATH
+host_lookup_order = bydns
+rfc1413_query_timeout = 0s
+spool_directory = DIR/spool
+log_file_path = DIR/spool/log/%slog
+gecos_pattern = ""
+gecos_name = CALLER_NAME
+
+# ----- Main settings -----
+
+domainlist local_domains = @
+
+acl_smtp_rcpt = accept
+acl_smtp_data = check_data
+
+trusted_users = CALLER
+
+
+# ----- ACL -----
+
+begin acl
+
+check_data:
+ accept logwrite = \
+ x-test-header-good1: ${utf8clean:$h_x-test-header-good1:}
+ logwrite = \
+ x-test-header-good2: ${utf8clean:$h_x-test-header-good2:}
+ logwrite = \
+ x-test-header-too-short: ${utf8clean:$h_x-test-header-too-short:}
+ logwrite = \
+ x-test-header-too-long: ${utf8clean:$h_x-test-header-too-long:}
+ logwrite = \
+ x-test-header-too-big: ${utf8clean:$h_x-test-header-too-big:}
+
+
+
+# ----- Routers -----
+
+begin routers
+
+fail_remote_domains:
+ driver = redirect
+ domains = ! +local_domains
+ data = :fail: unrouteable mail domain "$domain"
+
+localuser:
+ driver = accept
+ check_local_user
+ transport = local_delivery
+ headers_add = X-local-user: uid=$local_user_uid gid=$local_user_gid
+
+
+# ----- Transports -----
+
+begin transports
+
+local_delivery:
+ driver = appendfile
+ delivery_date_add
+ envelope_to_add
+ file = DIR/test-mail/$local_part
+ headers_add = "X-body-linecount: $body_linecount\n\
+ X-message-linecount: $message_linecount\n\
+ X-received-count: $received_count"
+ return_path_add
+
+# End
diff --git a/test/log/0600 b/test/log/0600
new file mode 100644
index 000000000..8fc8cfc36
--- /dev/null
+++ b/test/log/0600
@@ -0,0 +1,18 @@
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-good2: \303\237\303\274\303\266\303\244\342\202\254\303\234\303\226\303\204\302\264\340\244\221\340\244\225\340\244\234\341\220\201\341\221\214\341\221\225\360\253\235\206\360\253\237\230
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-short: ?.?.?.\303\244-?.-\303\234.?..?.-?.-?..-?.-?.-?.-?.-?..-?..?.
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-long: ?????-\303\244-?????--\303\226-\303\204-\302\264-\340\244\221-\340\244\225-\340\244\234-\341\220\201-\341\221\214-\341\221\225-?????\360\253\237\206
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-big: ?-----\363\200\200\200
+1999-03-02 09:44:33 10HmaX-0005vi-00 <= CALLER@the.local.host.name U=CALLER P=local-smtp S=sss
+1999-03-02 09:44:33 10HmaX-0005vi-00 => CALLER <CALLER@the.local.host.name> R=localuser T=local_delivery
+1999-03-02 09:44:33 10HmaX-0005vi-00 Completed
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-good1:
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-good2:
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-short:
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-long:
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-big:
+1999-03-02 09:44:33 10HmaY-0005vi-00 <= CALLER@the.local.host.name U=CALLER P=local-smtp S=sss
+1999-03-02 09:44:33 10HmaY-0005vi-00 => CALLER <CALLER@the.local.host.name> R=localuser T=local_delivery
+1999-03-02 09:44:33 10HmaY-0005vi-00 Completed
+1999-03-02 09:44:33 Start queue run: pid=pppp
+1999-03-02 09:44:33 End queue run: pid=pppp
diff --git a/test/mail/0600.CALLER b/test/mail/0600.CALLER
new file mode 100644
index 000000000..e9a50054e
--- /dev/null
+++ b/test/mail/0600.CALLER
@@ -0,0 +1,45 @@
+From CALLER@the.local.host.name Tue Mar 02 09:44:33 1999
+Return-path: <CALLER@the.local.host.name>
+Envelope-to: CALLER@the.local.host.name
+Delivery-date: Tue, 2 Mar 1999 09:44:33 +0000
+Received: from CALLER by the.local.host.name with local-smtp (Exim x.yz)
+ (envelope-from <CALLER@the.local.host.name>)
+ id 10HmaX-0005vi-00
+ for CALLER@the.local.host.name; Tue, 2 Mar 1999 09:44:33 +0000
+x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_
+x-test-header-good2: ßüöä€ÜÖÄ´ऑकजᐁᑌᑕ𫝆𫟘
+x-test-header-too-short: ...ä-.-Ü....-.-..-.-.-.-.-..-...
+x-test-header-too-long: -ä---Ö-Ä-´-ऑ-क-ज-ᐁ-ᑌ-ᑕ-𫟆
+x-test-header-too-big: -----󀀀
+Subject: This is a test message.
+Message-Id: <E10HmaX-0005vi-00@the.local.host.name>
+From: CALLER@the.local.host.name
+Date: Tue, 2 Mar 1999 09:44:33 +0000
+X-local-user: uid=CALLER_UID gid=CALLER_GID
+X-body-linecount: 3
+X-message-linecount: 16
+X-received-count: 1
+
+This is a test message.
+It has three lines.
+This is the last line.
+
+From CALLER@the.local.host.name Tue Mar 02 09:44:33 1999
+Return-path: <CALLER@the.local.host.name>
+Envelope-to: CALLER@the.local.host.name
+Delivery-date: Tue, 2 Mar 1999 09:44:33 +0000
+Received: from CALLER by the.local.host.name with local-smtp (Exim x.yz)
+ (envelope-from <CALLER@the.local.host.name>)
+ id 10HmaY-0005vi-00
+ for CALLER@the.local.host.name; Tue, 2 Mar 1999 09:44:33 +0000
+Subject: second
+Message-Id: <E10HmaY-0005vi-00@the.local.host.name>
+From: CALLER@the.local.host.name
+Date: Tue, 2 Mar 1999 09:44:33 +0000
+X-local-user: uid=CALLER_UID gid=CALLER_GID
+X-body-linecount: 1
+X-message-linecount: 9
+X-received-count: 1
+
+This is a second test message.
+
diff --git a/test/scripts/0000-Basic/0600 b/test/scripts/0000-Basic/0600
new file mode 100644
index 000000000..9d5e67b5b
--- /dev/null
+++ b/test/scripts/0000-Basic/0600
@@ -0,0 +1,32 @@
+# ${utf8clean:string}
+#
+# -bs to simple local delivery
+exim -bs -odi
+mail from:CALLER@HOSTNAME
+rcpt to:CALLER@HOSTNAME
+data
+x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_
+x-test-header-good2: ßüöä€ÜÖÄ´ऑकजᐁᑌᑕ𫝆𫟘
+x-test-header-too-short: ...ä-.-Ü....-.-..-.-.-.-.-..-...
+x-test-header-too-long: -ä---Ö-Ä-´-ऑ-क-ज-ᐁ-ᑌ-ᑕ-𫟆
+x-test-header-too-big: -----󀀀
+Subject: This is a test message.
+
+This is a test message.
+It has three lines.
+This is the last line.
+.
+quit
+****
+exim -bs -odi
+mail from:CALLER@HOSTNAME
+rcpt to:CALLER@HOSTNAME
+data
+Subject: second
+
+This is a second test message.
+.
+quit
+****
+exim -q
+****
diff --git a/test/stdout/0600 b/test/stdout/0600
new file mode 100644
index 000000000..2b1941f58
--- /dev/null
+++ b/test/stdout/0600
@@ -0,0 +1,12 @@
+220 the.local.host.name ESMTP Exim x.yz Tue, 2 Mar 1999 09:44:33 +0000
+250 OK
+250 Accepted
+354 Enter message, ending with "." on a line by itself
+250 OK id=10HmaX-0005vi-00
+221 the.local.host.name closing connection
+220 the.local.host.name ESMTP Exim x.yz Tue, 2 Mar 1999 09:44:33 +0000
+250 OK
+250 Accepted
+354 Enter message, ending with "." on a line by itself
+250 OK id=10HmaY-0005vi-00
+221 the.local.host.name closing connection