Patch by Phoenix

* adds line 8 for additional UTF-8 ranges
* adds cjk-utf8 locale
* fixes check 3.2 and adds check 3.3

git-svn-id: http://svn.inspircd.org/repository/trunk/inspircd@11082 e03df62e-2008-0410-955e-edbf42e46eb7
author: peavey <peavey@e03df62e-2008-0410-955e-edbf42e46eb7> 2009-02-11 10:51:18 +0000
committer: peavey <peavey@e03df62e-2008-0410-955e-edbf42e46eb7> 2009-02-11 10:51:18 +0000
commit: bbf472cb954ce729fba5804c6c49faa473a10748 (patch)
tree: 3875432b562294ebd8a97fd356db76dea320f34c
parent: 35d20fbabe8babe761e938e4c9a65c47f303f81d (diff)
2 files changed, 58 insertions, 17 deletions
diff --git a/locales/readme.txt b/locales/readme.txt
index 1304adbb5..069e664b5 100755
--- a/locales/readme.txt
+++ b/locales/readme.txt
@@ -23,7 +23,9 @@ Can be usefull for example for comparing nicknames that contains similar-looking
 
 6: List of additional UTF-8 allowed characters
 
-7: List of additional UTF-8 ranges (character followed by "range"). Strongly experimental. May be replaced in future versions.
+7: List of additional UTF-8 ranges (each start character followed by a 1-byte range length).
+
+8: List of additional UTF-8 intervals given as start/end pairs (i.e. start1, end1, start2, end2, ...). Every UTF-8 character between each start-end pair, inclusive, is assumed valid.
 
 *** Line format ***
 
@@ -44,5 +46,4 @@ In this case every character of line except first dot specifies one character-co
 
 *** TODO ***
 
-- Maybe replace line 7 with <interval start> <interval end> form?
 - UTF-8 collation rules (Inapplieable to InspIRCd atm).
diff --git a/src/modules/m_nationalchars.cpp b/src/modules/m_nationalchars.cpp
index 17be414a6..5d2be96d7 100755
--- a/src/modules/m_nationalchars.cpp
+++ b/src/modules/m_nationalchars.cpp
@@ -12,7 +12,8 @@
  */
 
 /* Contains a code of Unreal IRCd + Bynets patch ( http://www.unrealircd.com/ and http://www.bynets.org/ )
-   Changed at 2008-06-15 - 2008-12-15
+   Original patch is made by Dmitry "Killer{R}" Kononko. ( http://killprog.com/ )
+   Changed at 2008-06-15 - 2009-02-11
    by Chernov-Phoenix Alexey (Phoenix@RusNet) mailto:phoenix /email address separator/ pravmail.ru */
 
 #include "inspircd.h"
@@ -31,13 +32,14 @@ class lwbNickHandler : public HandlerBase2<bool, const char*, size_t>
 };
 
 								 /*,m_reverse_additionalUp[256];*/
-static unsigned char m_reverse_additional[256],m_additionalMB[256],m_additionalUtf8[256],m_additionalUtf8range[256];
+static unsigned char m_reverse_additional[256],m_additionalMB[256],m_additionalUtf8[256],m_additionalUtf8range[256],m_additionalUtf8interval[256];
 
 char utf8checkrest(unsigned char * mb, unsigned char cnt)
 {
 	for (unsigned char * tmp=mb; tmp<mb+cnt; tmp++)
 	{
-		if ((*tmp < 128) || (*tmp > 191))
+		/* & is faster! -- Phoenix. A continuation byte looks like 10xxxxxx, i.e. (byte & 0xC0) == 0x80 */
+		if ((*tmp & 192) != 128)
 			return -1;
 	}
 	return cnt + 1;
@@ -107,7 +109,7 @@ bool lwbNickHandler::Call(const char* n, size_t max)
 			continue;
 
 		/* 3.1. Check against a simple UTF-8 characters enumeration */
-		int cursize, ncursize = utf8size((unsigned char *)i);
+		int cursize, cursize2, ncursize = utf8size((unsigned char *)i);
 		/* do check only if current multibyte character is valid UTF-8 only */
 		if (ncursize != -1)
 		{
@@ -130,25 +132,39 @@ bool lwbNickHandler::Call(const char* n, size_t max)
 			if (found)
 				continue;
 
-			/* 3.2. Check against an UTF-8 ranges: <start character> and <lenght of the range>.
-			Also char. is to be checked if it is a valid UTF-8 one */
+			/* 3.2. Check against UTF-8 ranges: <start character> and <length of the range>. */
 			found = false;
 			for (unsigned char * mb = m_additionalUtf8range; (utf8size(mb) != -1) && (mb < m_additionalUtf8range + sizeof(m_additionalUtf8range)); mb += cursize + 1)
 			{
 				cursize = utf8size(mb);
-				/* Size differs? Pick the next! */
+				/* Size differs (or lengthbyte is zero)? Pick the next! */
 				if ((cursize != ncursize) || (!mb[cursize]))
 					continue;
 
-				unsigned char uright[5] = {0,0,0,0,0};
-
+				unsigned char uright[5] = {0,0,0,0,0}, range = mb[cursize] - 1;
 				strncpy((char* ) uright, (char *) mb, cursize);
 
-				if ((uright[cursize-1] + mb[cursize]-1>0xff) && (cursize != 1))
+				for (int temp = cursize - 1; (temp >= 0) && range; --temp)
 				{
-					uright[cursize - 2]+=1;
+					/* all but the first char are 64-based */
+					if (temp)
+					{
+						char part64 = range & 63; /* i.e. % 64 */
+						/* handle carrying over */
+						if (uright[temp] + part64 - 1 > 191)
+						{
+							uright[temp] -= 64;
+							range += 64;
+						}
+						uright[temp] += part64;
+						range >>= 6; /* divide it on a 64 */
+					}
+					/* the first char of UTF-8 doesn't follow the rule */
+					else
+					{
+						uright[temp] += range;
+					}
 				}
-				uright[cursize - 1] = (uright[cursize - 1]+mb[cursize] - 1) % 0x100;
 
 				if ((strncmp(i, (char *) mb, cursize) >= 0) && (strncmp(i, (char *) uright, cursize) <= 0))
 				{
@@ -160,6 +176,30 @@ bool lwbNickHandler::Call(const char* n, size_t max)
 			}
 			if (found)
 				continue;
+
+			/* 3.3. Check against UTF-8 intervals: <start character> and <end character>. */
+			found = false;
+			for (unsigned char * mb = m_additionalUtf8interval; (utf8size(mb) != -1) && (utf8size(mb+utf8size(mb)) != -1)
+				&& (mb < m_additionalUtf8interval + sizeof(m_additionalUtf8interval)); mb += (cursize+cursize2) )
+			{
+				cursize = utf8size(mb);
+				cursize2= utf8size(mb+cursize);
+
+				int minlen  = cursize  > ncursize ? ncursize : cursize;
+				int minlen2 = cursize2 > ncursize ? ncursize : cursize2;
+
+				unsigned char* uright = mb + cursize;
+
+				if ((strncmp(i, (char *) mb, minlen) >= 0) && (strncmp(i, (char *) uright, minlen2) <= 0))
+				{
+					i += cursize - 1;
+					p += cursize - 1;
+					found = true;
+					break;
+				}
+			}
+			if (found)
+				continue;
 		}
 
 		/* invalid character! abort */
@@ -216,8 +256,8 @@ class ModuleNationalChars : public Module
 		charset = conf->ReadValue("nationalchars", "file", 0);
 		casemapping = conf->ReadValue("nationalchars", "casemapping", charset, 0, false);
 		charset.insert(0, "../locales/");
-		unsigned char * tables[7] = { m_additional, m_additionalMB, m_additionalUp, m_lower, m_upper, m_additionalUtf8, m_additionalUtf8range };
-		loadtables(charset, tables, 7, 5);
+		unsigned char * tables[8] = { m_additional, m_additionalMB, m_additionalUp, m_lower, m_upper, m_additionalUtf8, m_additionalUtf8range, m_additionalUtf8interval };
+		loadtables(charset, tables, 8, 5);
 		forcequit = conf->ReadFlag("nationalchars", "forcequit", 0);
 		CheckForceQuit("National character set changed");
 		delete conf;
@@ -319,7 +359,7 @@ class ModuleNationalChars : public Module
 		if (buf[0] == '.')	/* simple plain-text string after dot */
 		{
 			i = buf.size() - 1;
-	
+
 			if (i > (maxindex + 1))
 				i = maxindex + 1;
author	peavey <peavey@e03df62e-2008-0410-955e-edbf42e46eb7>	2009-02-11 10:51:18 +0000
committer	peavey <peavey@e03df62e-2008-0410-955e-edbf42e46eb7>	2009-02-11 10:51:18 +0000
commit	bbf472cb954ce729fba5804c6c49faa473a10748 (patch)
tree	3875432b562294ebd8a97fd356db76dea320f34c
parent	35d20fbabe8babe761e938e4c9a65c47f303f81d (diff)