summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAxel Rau <axel.rau@chaos1.de>2014-03-08 20:59:24 +0000
committerJeremy Harris <jgh146exb@wizmail.org>2014-03-08 21:01:55 +0000
commitb9c2e32fc0c80d2ff4afb7daa782d22c5c38a229 (patch)
treeda963e9fc7b912b2600a3f23e3c8d2411a7a630c /src
parentc27ab1df218daf0cdf9bfd0f474175c5f53f0d9c (diff)
${utf8clean:string} expansion operator. Bug 1401
Diffstat (limited to 'src')
-rw-r--r--src/src/expand.c92
1 files changed, 90 insertions, 2 deletions
diff --git a/src/src/expand.c b/src/src/expand.c
index 7e1b32343..ee7b2fdc6 100644
--- a/src/src/expand.c
+++ b/src/src/expand.c
@@ -204,7 +204,8 @@ static uschar *op_table_main[] = {
US"str2b64",
US"strlen",
US"substr",
- US"uc" };
+ US"uc",
+ US"utf8clean" };
enum {
EOP_ADDRESS = sizeof(op_table_underscore)/sizeof(uschar *),
@@ -240,7 +241,8 @@ enum {
EOP_STR2B64,
EOP_STRLEN,
EOP_SUBSTR,
- EOP_UC };
+ EOP_UC,
+ EOP_UTF8CLEAN };
/* Table of condition names, and corresponding switch numbers. The names must
@@ -6206,6 +6208,89 @@ while (*s != 0)
continue;
}
+ /* replace illegal UTF-8 sequences by replacement character */
+
+ #define UTF8_REPLACEMENT_CHAR US"?"
+
+ case EOP_UTF8CLEAN:
+ {
+ int seq_len, index = 0;
+ int bytes_left = 0;
+ uschar seq_buff[4]; /* accumulate utf-8 here */
+
+ while (*sub != 0)
+ {
+ int complete;
+ long codepoint;
+ uschar c;
+
+ complete = 0;
+ c = *sub++;
+ if(bytes_left)
+ {
+ if ((c & 0xc0) != 0x80)
+ {
+ /* wrong continuation byte; invalidate all bytes */
+ complete = 1; /* error */
+ }
+ else
+ {
+ codepoint = (codepoint << 6) | (c & 0x3f);
+ seq_buff[index++] = c;
+ if (--bytes_left == 0) /* codepoint complete */
+ {
+ if(codepoint > 0x10FFFF) /* is it too large? */
+ complete = -1; /* error */
+ else
+ { /* finished; output utf-8 sequence */
+ yield = string_cat(yield, &size, &ptr, seq_buff, seq_len);
+ index = 0;
+ }
+ }
+ }
+ }
+ else /* no bytes left: new sequence */
+ {
+ if((c & 0x80) == 0) /* 1-byte sequence, US-ASCII, keep it */
+ {
+ yield = string_cat(yield, &size, &ptr, &c, 1);
+ continue;
+ }
+ if((c & 0xe0) == 0xc0) /* 2-byte sequence */
+ {
+ bytes_left = 1;
+ codepoint = c & 0x1f;
+ }
+ else if((c & 0xf0) == 0xe0) /* 3-byte sequence */
+ {
+ bytes_left = 2;
+ codepoint = c & 0x0f;
+ }
+ else if((c & 0xf8) == 0xf0) /* 4-byte sequence */
+ {
+ bytes_left = 3;
+ codepoint = c & 0x07;
+ }
+ else /* invalid or too long (RFC3629 allows only 4 bytes) */
+ complete = -1;
+
+ seq_buff[index++] = c;
+ seq_len = bytes_left + 1;
+ } /* if(bytes_left) */
+
+ if (complete != 0)
+ {
+ bytes_left = index = 0;
+ yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1);
+ }
+ if ((complete == 1) && ((c & 0x80) == 0))
+ { /* ASCII character follows incomplete sequence */
+ yield = string_cat(yield, &size, &ptr, &c, 1);
+ }
+ }
+ continue;
+ }
+
/* escape turns all non-printing characters into escape sequences. */
case EOP_ESCAPE:
@@ -6834,4 +6919,7 @@ return 0;
#endif
+/*
+ vi: aw ai sw=2
+*/
/* End of expand.c */