From bbf472cb954ce729fba5804c6c49faa473a10748 Mon Sep 17 00:00:00 2001 From: peavey Date: Wed, 11 Feb 2009 10:51:18 +0000 Subject: Patch by Phoenix * adds line 8 for additional UTF-8 ranges * adds cjk-utf8 locale * fixes check 3.2 and adds check 3.3 git-svn-id: http://svn.inspircd.org/repository/trunk/inspircd@11082 e03df62e-2008-0410-955e-edbf42e46eb7 --- locales/readme.txt | 5 +-- src/modules/m_nationalchars.cpp | 70 ++++++++++++++++++++++++++++++++--------- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/locales/readme.txt b/locales/readme.txt index 1304adbb5..069e664b5 100755 --- a/locales/readme.txt +++ b/locales/readme.txt @@ -23,7 +23,9 @@ Can be usefull for example for comparing nicknames that contains similar-looking 6: List of additional UTF-8 allowed characters -7: List of additional UTF-8 ranges (character followed by "range"). Strongly experimental. May be replaced in future versions. +7: List of additional UTF-8 ranges (character followed by 1-byte "range"). + +8: List of additional UTF-8 ranges (i.e. start1, end1, start2, end2,... UTF8-characters between each start-end pair assumed valid). *** Line format *** @@ -44,5 +46,4 @@ In this case every character of line except first dot specifies one character-co *** TODO *** -- Maybe replace line 7 with form? - UTF-8 collation rules (Inapplieable to InspIRCd atm). diff --git a/src/modules/m_nationalchars.cpp b/src/modules/m_nationalchars.cpp index 17be414a6..5d2be96d7 100755 --- a/src/modules/m_nationalchars.cpp +++ b/src/modules/m_nationalchars.cpp @@ -12,7 +12,8 @@ */ /* Contains a code of Unreal IRCd + Bynets patch ( http://www.unrealircd.com/ and http://www.bynets.org/ ) - Changed at 2008-06-15 - 2008-12-15 + Original patch is made by Dmitry "Killer{R}" Kononko. ( http://killprog.com/ ) + Changed at 2008-06-15 - 2009-02-11 by Chernov-Phoenix Alexey (Phoenix@RusNet) mailto:phoenix /email address separator/ pravmail.ru */ #include "inspircd.h" @@ -31,13 +32,14 @@ class lwbNickHandler : public HandlerBase2 }; /*,m_reverse_additionalUp[256];*/ -static unsigned char m_reverse_additional[256],m_additionalMB[256],m_additionalUtf8[256],m_additionalUtf8range[256]; +static unsigned char m_reverse_additional[256],m_additionalMB[256],m_additionalUtf8[256],m_additionalUtf8range[256],m_additionalUtf8interval[256]; char utf8checkrest(unsigned char * mb, unsigned char cnt) { for (unsigned char * tmp=mb; tmp 191)) + /* & is faster! -- Phoenix (char & b11000000 == b10000000) */ + if ((*tmp & 192) != 128) return -1; } return cnt + 1; @@ -107,7 +109,7 @@ bool lwbNickHandler::Call(const char* n, size_t max) continue; /* 3.1. Check against a simple UTF-8 characters enumeration */ - int cursize, ncursize = utf8size((unsigned char *)i); + int cursize, cursize2, ncursize = utf8size((unsigned char *)i); /* do check only if current multibyte character is valid UTF-8 only */ if (ncursize != -1) { @@ -130,25 +132,39 @@ bool lwbNickHandler::Call(const char* n, size_t max) if (found) continue; - /* 3.2. Check against an UTF-8 ranges: and . - Also char. is to be checked if it is a valid UTF-8 one */ + /* 3.2. Check against an UTF-8 ranges: and . */ found = false; for (unsigned char * mb = m_additionalUtf8range; (utf8size(mb) != -1) && (mb < m_additionalUtf8range + sizeof(m_additionalUtf8range)); mb += cursize + 1) { cursize = utf8size(mb); - /* Size differs? Pick the next! */ + /* Size differs (or lengthbyte is zero)? Pick the next! */ if ((cursize != ncursize) || (!mb[cursize])) continue; - unsigned char uright[5] = {0,0,0,0,0}; - + unsigned char uright[5] = {0,0,0,0,0}, range = mb[cursize] - 1; strncpy((char* ) uright, (char *) mb, cursize); - if ((uright[cursize-1] + mb[cursize]-1>0xff) && (cursize != 1)) + for (int temp = cursize - 1; (temp >= 0) && range; --temp) { - uright[cursize - 2]+=1; + /* all but the first char are 64-based */ + if (temp) + { + char part64 = range & 63; /* i.e. % 64 */ + /* handle carrying over */ + if (uright[temp] + part64 - 1 > 191) + { + uright[temp] -= 64; + range += 64; + } + uright[temp] += part64; + range >>= 6; /* divide it on a 64 */ + } + /* the first char of UTF-8 doesn't follow the rule */ + else + { + uright[temp] += range; + } } - uright[cursize - 1] = (uright[cursize - 1]+mb[cursize] - 1) % 0x100; if ((strncmp(i, (char *) mb, cursize) >= 0) && (strncmp(i, (char *) uright, cursize) <= 0)) { @@ -160,6 +176,30 @@ bool lwbNickHandler::Call(const char* n, size_t max) } if (found) continue; + + /* 3.3. Check against an UTF-8 intervals: and . */ + found = false; + for (unsigned char * mb = m_additionalUtf8interval; (utf8size(mb) != -1) && (utf8size(mb+utf8size(mb)) != -1) + && (mb < m_additionalUtf8interval + sizeof(m_additionalUtf8interval)); mb += (cursize+cursize2) ) + { + cursize = utf8size(mb); + cursize2= utf8size(mb+cursize); + + int minlen = cursize > ncursize ? ncursize : cursize; + int minlen2 = cursize2 > ncursize ? ncursize : cursize2; + + unsigned char* uright = mb + cursize; + + if ((strncmp(i, (char *) mb, minlen) >= 0) && (strncmp(i, (char *) uright, minlen2) <= 0)) + { + i += cursize - 1; + p += cursize - 1; + found = true; + break; + } + } + if (found) + continue; } /* invalid character! abort */ @@ -216,8 +256,8 @@ class ModuleNationalChars : public Module charset = conf->ReadValue("nationalchars", "file", 0); casemapping = conf->ReadValue("nationalchars", "casemapping", charset, 0, false); charset.insert(0, "../locales/"); - unsigned char * tables[7] = { m_additional, m_additionalMB, m_additionalUp, m_lower, m_upper, m_additionalUtf8, m_additionalUtf8range }; - loadtables(charset, tables, 7, 5); + unsigned char * tables[8] = { m_additional, m_additionalMB, m_additionalUp, m_lower, m_upper, m_additionalUtf8, m_additionalUtf8range, m_additionalUtf8interval }; + loadtables(charset, tables, 8, 5); forcequit = conf->ReadFlag("nationalchars", "forcequit", 0); CheckForceQuit("National character set changed"); delete conf; @@ -319,7 +359,7 @@ class ModuleNationalChars : public Module if (buf[0] == '.') /* simple plain-text string after dot */ { i = buf.size() - 1; - + if (i > (maxindex + 1)) i = maxindex + 1; -- cgit v1.2.3