pissircd/src/modules/charsys.c

1549 lines
49 KiB
C
Raw Permalink Blame History

/*
* Unreal Internet Relay Chat Daemon, src/charsys.c
* (C) Copyright 2005-2017 Bram Matthys and The UnrealIRCd Team.
*
* Character system: This subsystem deals with finding out wheter a
* character should be allowed or not in nicks (nicks only for now).
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 1, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "unrealircd.h"
#include <uchar.h>
#ifndef ARRAY_SIZEOF
#define ARRAY_SIZEOF(x) (sizeof((x))/sizeof((x)[0]))
#endif
ModuleHeader MOD_HEADER
= {
"charsys", /* Name of module */
"5.0-emoji-0.2.1", /* Version */
"Character System (set::allowed-nickchars)", /* Short description of module */
"UnrealIRCd Team",
"unrealircd-6",
};
/* NOTE: it is guaranteed that char is unsigned by compiling options
* (-funsigned-char @ gcc, /J @ MSVC)
* NOTE2: Original credit for supplying the correct chinese
* coderanges goes to: RexHsu, Mr.WebBar and Xuefer
*/
/** Our multibyte structure */
typedef struct MBList MBList;
struct MBList
{
MBList *next;
char s1, e1, s2, e2;
};
MBList *mblist = NULL, *mblist_tail = NULL;
/** Our unicode structure */
typedef struct UList UList;
struct UList
{
UList *next;
char32_t start, end;
};
UList *ulist = NULL, *ulist_tail = NULL;
/* Use this to prevent mixing of certain combinations
* (such as GBK & high-ascii, etc)
*/
static int langav = 0;
char langsinuse[4096];
/* bitmasks: */
#define LANGAV_ASCII 0x000001 /* 8 bit ascii */
#define LANGAV_LATIN1 0x000002 /* latin1 (western europe) */
#define LANGAV_LATIN2 0x000004 /* latin2 (eastern europe, eg: polish) */
#define LANGAV_ISO8859_7 0x000008 /* greek */
#define LANGAV_ISO8859_8I 0x000010 /* hebrew */
#define LANGAV_ISO8859_9 0x000020 /* turkish */
#define LANGAV_W1250 0x000040 /* windows-1250 (eg: polish-w1250) */
#define LANGAV_W1251 0x000080 /* windows-1251 (eg: russian) */
#define LANGAV_LATIN2W1250 0x000100 /* Compatible with both latin2 AND windows-1250 (eg: hungarian) */
#define LANGAV_ISO8859_6 0x000200 /* arabic */
#define LANGAV_GBK 0x001000 /* (Chinese) GBK encoding */
#define LANGAV_UTF8 0x002000 /* any UTF8 encoding */
#define LANGAV_LATIN_UTF8 0x004000 /* UTF8: latin script */
#define LANGAV_CYRILLIC_UTF8 0x008000 /* UTF8: cyrillic script */
#define LANGAV_GREEK_UTF8 0x010000 /* UTF8: greek script */
#define LANGAV_HEBREW_UTF8 0x020000 /* UTF8: hebrew script */
#define LANGAV_ARABIC_UTF8 0x040000 /* UTF8: arabic script */
typedef struct LangList LangList;
struct LangList
{
char *directive;
char *code;
int setflags;
};
/* MUST be alphabetized (first column) */
static LangList langlist[] = {
{ "arabic-utf8", "ara-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_ARABIC_UTF8 },
{ "arabic-utf8", "ara-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_ARABIC_UTF8 },
{ "belarussian-utf8", "blr-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_CYRILLIC_UTF8 },
{ "belarussian-w1251", "blr", LANGAV_ASCII|LANGAV_W1251 },
{ "catalan", "cat", LANGAV_ASCII|LANGAV_LATIN1 },
{ "catalan-utf8", "cat-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "chinese", "chi-j,chi-s,chi-t", LANGAV_GBK },
{ "chinese-ja", "chi-j", LANGAV_GBK },
{ "chinese-simp", "chi-s", LANGAV_GBK },
{ "chinese-trad", "chi-t", LANGAV_GBK },
{ "cyrillic-utf8", "blr-utf8,rus-utf8,ukr-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_CYRILLIC_UTF8 },
{ "czech", "cze-m", LANGAV_ASCII|LANGAV_W1250 },
{ "czech-utf8", "cze-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "danish", "dan", LANGAV_ASCII|LANGAV_LATIN1 },
{ "danish-utf8", "dan-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "dutch", "dut", LANGAV_ASCII|LANGAV_LATIN1 },
{ "dutch-utf8", "dut-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "emoji-utf8", "emo-utf8", LANGAV_UTF8 },
{ "estonian-utf8","est-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "french", "fre", LANGAV_ASCII|LANGAV_LATIN1 },
{ "french-utf8", "fre-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "gbk", "chi-s,chi-t,chi-j", LANGAV_GBK },
{ "german", "ger", LANGAV_ASCII|LANGAV_LATIN1 },
{ "german-utf8", "ger-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "greek", "gre", LANGAV_ASCII|LANGAV_ISO8859_7 },
{ "greek-utf8", "gre-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_GREEK_UTF8 },
{ "hebrew", "heb", LANGAV_ASCII|LANGAV_ISO8859_8I },
{ "hebrew-utf8", "heb-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_HEBREW_UTF8 },
{ "hungarian", "hun", LANGAV_ASCII|LANGAV_LATIN2W1250 },
{ "hungarian-utf8","hun-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "icelandic", "ice", LANGAV_ASCII|LANGAV_LATIN1 },
{ "icelandic-utf8","ice-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "italian", "ita", LANGAV_ASCII|LANGAV_LATIN1 },
{ "italian-utf8", "ita-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "latin-utf8", "cat-utf8,cze-utf8,dan-utf8,dut-utf8,fre-utf8,ger-utf8,hun-utf8,ice-utf8,ita-utf8,pol-utf8,rum-utf8,slo-utf8,spa-utf8,swe-utf8,tur-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "latin1", "cat,dut,fre,ger,ita,spa,swe", LANGAV_ASCII|LANGAV_LATIN1 },
{ "latin2", "hun,pol,rum", LANGAV_ASCII|LANGAV_LATIN2 },
{ "latvian-utf8", "lav-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "lithuanian-utf8","lit-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "polish", "pol", LANGAV_ASCII|LANGAV_LATIN2 },
{ "polish-utf8", "pol-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "polish-w1250", "pol-m", LANGAV_ASCII|LANGAV_W1250 },
{ "romanian", "rum", LANGAV_ASCII|LANGAV_LATIN2W1250 },
{ "romanian-utf8","rum-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "russian-utf8", "rus-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_CYRILLIC_UTF8 },
{ "russian-w1251","rus", LANGAV_ASCII|LANGAV_W1251 },
{ "slovak", "slo-m", LANGAV_ASCII|LANGAV_W1250 },
{ "slovak-utf8", "slo-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "spanish", "spa", LANGAV_ASCII|LANGAV_LATIN1 },
{ "spanish-utf8", "spa-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "swedish", "swe", LANGAV_ASCII|LANGAV_LATIN1 },
{ "swedish-utf8", "swe-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "swiss-german", "swg", LANGAV_ASCII|LANGAV_LATIN1 },
{ "swiss-german-utf8", "swg-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "turkish", "tur", LANGAV_ASCII|LANGAV_ISO8859_9 },
{ "turkish-utf8", "tur-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_LATIN_UTF8 },
{ "ukrainian-utf8", "ukr-utf8", LANGAV_ASCII|LANGAV_UTF8|LANGAV_CYRILLIC_UTF8 },
{ "ukrainian-w1251", "ukr", LANGAV_ASCII|LANGAV_W1251 },
{ "windows-1250", "cze-m,pol-m,rum,slo-m,hun", LANGAV_ASCII|LANGAV_W1250 },
{ "windows-1251", "rus,ukr,blr", LANGAV_ASCII|LANGAV_W1251 },
{ NULL, NULL, 0 }
};
/* For temporary use during config_run */
typedef struct ILangList ILangList;
struct ILangList
{
ILangList *prev, *next;
char *name;
};
ILangList *ilanglist = NULL;
/* These characters are ALWAYS disallowed... from remote, in
* multibyte, etc.. even though this might mean a certain
* (legit) character cannot be used (eg: in chinese GBK).
* - ! (nick!user seperator)
* - prefix chars: +, %, @, &, ~
* - channel chars: #
* - scary chars: $, :, ', ", ?, *, ',', '.'
* NOTE: the caller should also check for ascii <= 32.
* [CHANGING THIS WILL CAUSE SECURITY/SYNCH PROBLEMS AND WILL
* VIOLATE YOUR ""RIGHT"" ON SUPPORT IMMEDIATELY]
*/
const char *illegalnickchars = "!+%@&~#$:'\"?*,.";
/* Forward declarations */
int _do_nick_name(char *nick);
int _do_remote_nick_name(char *nick);
static int do_nick_name_multibyte(char *nick);
static int do_nick_name_standard(char *nick);
void charsys_reset(void);
void charsys_reset_pretest(void);
void charsys_finish(void);
void charsys_addmultibyterange(char s1, char e1, char s2, char e2);
void charsys_addallowed(char *s);
int charsys_test_language(char *name);
void charsys_add_language(char *name);
static void charsys_doadd_language(char *name);
int charsys_config_test(ConfigFile *cf, ConfigEntry *ce, int type, int *errs);
int charsys_config_run(ConfigFile *cf, ConfigEntry *ce, int type);
int charsys_config_posttest(int *errs);
char *_charsys_get_current_languages(void);
MOD_TEST()
{
MARK_AS_OFFICIAL_MODULE(modinfo);
EfunctionAdd(modinfo->handle, EFUNC_DO_NICK_NAME, _do_nick_name);
EfunctionAdd(modinfo->handle, EFUNC_DO_REMOTE_NICK_NAME, _do_remote_nick_name);
EfunctionAddString(modinfo->handle, EFUNC_CHARSYS_GET_CURRENT_LANGUAGES, _charsys_get_current_languages);
charsys_reset();
charsys_reset_pretest();
HookAdd(modinfo->handle, HOOKTYPE_CONFIGTEST, 0, charsys_config_test);
HookAdd(modinfo->handle, HOOKTYPE_CONFIGPOSTTEST, 0, charsys_config_posttest);
return MOD_SUCCESS;
}
MOD_INIT()
{
MARK_AS_OFFICIAL_MODULE(modinfo);
HookAdd(modinfo->handle, HOOKTYPE_CONFIGRUN, 0, charsys_config_run);
return MOD_SUCCESS;
}
/* Is first run when server is 100% ready */
MOD_LOAD()
{
charsys_finish();
return MOD_SUCCESS;
}
/* Called when module is unloaded */
MOD_UNLOAD()
{
return MOD_SUCCESS;
}
int charsys_config_test(ConfigFile *cf, ConfigEntry *ce, int type, int *errs)
{
int errors = 0;
ConfigEntry *cep;
if (type != CONFIG_SET)
return 0;
/* We are only interested in set::allowed-nickchars... */
if (!ce || !ce->name || strcmp(ce->name, "allowed-nickchars"))
return 0;
if (ce->value)
{
config_error("%s:%i: set::allowed-nickchars: please use 'allowed-nickchars { name; };' "
"and not 'allowed-nickchars name;'",
ce->file->filename, ce->line_number);
/* Give up immediately. Don't bother the user with any other errors. */
errors++;
*errs = errors;
return -1;
}
for (cep = ce->items; cep; cep=cep->next)
{
if (!charsys_test_language(cep->name))
{
config_error("%s:%i: set::allowed-nickchars: Unknown (sub)language '%s'",
ce->file->filename, ce->line_number, cep->name);
errors++;
}
}
*errs = errors;
return errors ? -1 : 1;
}
int charsys_config_run(ConfigFile *cf, ConfigEntry *ce, int type)
{
ConfigEntry *cep;
if (type != CONFIG_SET)
return 0;
/* We are only interrested in set::allowed-nickchars... */
if (!ce || !ce->name || strcmp(ce->name, "allowed-nickchars"))
return 0;
for (cep = ce->items; cep; cep = cep->next)
charsys_add_language(cep->name);
return 1;
}
/** Check if the specified charsets during the TESTING phase can be
* premitted without getting into problems.
* RETURNS: -1 in case of failure, 1 if ok
*/
int charsys_config_posttest(int *errs)
{
int errors = 0;
int x=0;
if ((langav & LANGAV_ASCII) && (langav & LANGAV_GBK))
{
config_error("ERROR: set::allowed-nickchars specifies incorrect combination "
"of languages: high-ascii languages (such as german, french, etc) "
"cannot be mixed with chinese/..");
return -1;
}
if (langav & LANGAV_LATIN_UTF8)
x++;
if (langav & LANGAV_GREEK_UTF8)
x++;
if (langav & LANGAV_CYRILLIC_UTF8)
x++;
if (langav & LANGAV_HEBREW_UTF8)
x++;
if (langav & LANGAV_LATIN1)
x++;
if (langav & LANGAV_LATIN2)
x++;
if (langav & LANGAV_ISO8859_6)
x++;
if (langav & LANGAV_ISO8859_7)
x++;
if (langav & LANGAV_ISO8859_9)
x++;
if (langav & LANGAV_W1250)
x++;
if (langav & LANGAV_W1251)
x++;
if ((langav & LANGAV_LATIN2W1250) && !(langav & LANGAV_LATIN2) && !(langav & LANGAV_W1250))
x++;
if (x > 1)
{
#if 0
// I don't think this should be hard error, right? Some combinations may be problematic, but not all.
if (langav & LANGAV_LATIN_UTF8)
{
config_error("ERROR: set::allowed-nickchars: you cannot combine 'latin-utf8' with any other character set");
errors++;
}
if (langav & LANGAV_GREEK_UTF8)
{
config_error("ERROR: set::allowed-nickchars: you cannot combine 'greek-utf8' with any other character set");
errors++;
}
if (langav & LANGAV_CYRILLIC_UTF8)
{
config_error("ERROR: set::allowed-nickchars: you cannot combine 'cyrillic-utf8' with any other character set");
errors++;
}
if (langav & LANGAV_HEBREW_UTF8)
{
config_error("ERROR: set::allowed-nickchars: you cannot combine 'hebrew-utf8' with any other character set");
errors++;
}
if (langav & LANGAV_ARABIC_UTF8)
{
config_error("ERROR: set::allowed-nickchars: you cannot combine 'arabic-utf8' with any other character set");
errors++;
}
#endif
config_status("WARNING: set::allowed-nickchars: Mixing of charsets (eg: latin1+latin2) may cause display problems");
}
*errs = errors;
return errors ? -1 : 1;
}
/** Called on boot and just before config run */
void charsys_reset(void)
{
int i;
MBList *m, *m_next;
UList *u, *u_next;
/* First, reset everything */
for (i=0; i < 256; i++)
char_atribs[i] &= ~ALLOWN;
for (m=mblist; m; m=m_next)
{
m_next = m->next;
safe_free(m);
}
mblist=mblist_tail=NULL;
for (u = ulist; u; u = u_next)
{
u_next = u->next;
safe_free(u);
}
ulist = ulist_tail = NULL;
/* Then add the default which will always be allowed */
charsys_addallowed("0123456789-ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyzy{|}");
langav = 0;
langsinuse[0] = '\0';
#ifdef DEBUGMODE
if (ilanglist)
abort();
#endif
}
void charsys_reset_pretest(void)
{
langav = 0;
non_utf8_nick_chars_in_use = 0;
}
static inline void ilang_swap(ILangList *one, ILangList *two)
{
char *tmp = one->name;
one->name = two->name;
two->name = tmp;
}
static void ilang_sort(void)
{
ILangList *outer, *inner;
/* Selection sort -- perhaps optimize to qsort/whatever if
* possible? ;)
*/
for (outer=ilanglist; outer; outer=outer->next)
{
for (inner=outer->next; inner; inner=inner->next)
{
if (strcmp(outer->name, inner->name) > 0)
ilang_swap(outer, inner);
}
}
}
void charsys_finish(void)
{
ILangList *e, *e_next;
/* Sort alphabetically */
ilang_sort();
/* [note: this can be optimized] */
langsinuse[0] = '\0';
for (e=ilanglist; e; e=e->next)
{
strlcat(langsinuse, e->name, sizeof(langsinuse));
if (e->next)
strlcat(langsinuse, ",", sizeof(langsinuse));
}
/* Free everything */
for (e=ilanglist; e; e=e_next)
{
e_next=e->next;
safe_free(e->name);
safe_free(e);
}
ilanglist = NULL;
#ifdef DEBUGMODE
if (strlen(langsinuse) > 490)
abort();
#endif
charsys_check_for_changes();
}
/** Add a character range to the multibyte list.
* Eg: charsys_addmultibyterange(0xaa, 0xbb, 0x00, 0xff) for 0xaa00-0xbbff.
* @param s1 Start of highest byte
* @param e1 End of highest byte
* @param s2 Start of lowest byte
* @param e2 End of lowest byte
*/
void charsys_addmultibyterange(char s1, char e1, char s2, char e2)
{
MBList *m = safe_alloc(sizeof(MBList));
m->s1 = s1;
m->e1 = e1;
m->s2 = s2;
m->e2 = e2;
if (mblist_tail)
mblist_tail->next = m;
else
mblist = m;
mblist_tail = m;
}
void charsys_addunicoderange(char32_t start, char32_t end)
{
UList *u = safe_alloc(sizeof(UList));
u->start = start;
u->end = end;
if (ulist_tail)
ulist_tail->next = u;
else
ulist = u;
ulist_tail = u;
}
/** Adds all characters in the specified string to the allowed list. */
void charsys_addallowed(char *s)
{
for (; *s; s++)
{
if ((*s <= 32) || strchr(illegalnickchars, *s))
{
config_error("INTERNAL ERROR: charsys_addallowed() called for illegal characters: %s", s);
#ifdef DEBUGMODE
abort();
#endif
}
char_atribs[(unsigned char)*s] |= ALLOWN;
}
}
void charsys_addallowed_range(unsigned char from, unsigned char to)
{
unsigned char i;
for (i = from; i != to; i++)
char_atribs[i] |= ALLOWN;
}
int _do_nick_name(char *nick)
{
if (mblist || ulist)
return do_nick_name_multibyte(nick);
else
return do_nick_name_standard(nick);
}
static int do_nick_name_standard(char *nick)
{
int len;
char *ch;
if ((*nick == '-') || isdigit(*nick))
return 0;
for (ch=nick,len=0; *ch && len <= NICKLEN; ch++, len++)
if (!isvalid(*ch))
return 0; /* reject the full nick */
*ch = '\0';
return len;
}
static int isvalidmbyte(unsigned char c1, unsigned char c2)
{
MBList *m;
for (m=mblist; m; m=m->next)
{
if ((c1 >= m->s1) && (c1 <= m->e1) &&
(c2 >= m->s2) && (c2 <= m->e2))
return 1;
}
return 0;
}
static int isvalidunicodechar(char32_t uch)
{
UList *u;
for (u = ulist; u; u = u->next)
{
if (uch >= u->start && uch <= u->end)
return 1;
}
return 0;
}
static int isvalidunicodenick(char32_t *unick)
{
char32_t *uch, *puch = 0;
/* Consecutive number of U+1F3F4 (waving black flag) and U+E0030..U+E0039 (tag digit one..nine)
* plus U+E0061..U+E007A (tag latin small letter a..z) code points */
int black_flag_or_tag_count = 0;
for (uch = unick; *uch; puch = uch, uch++)
{
/* Accept only a single ZWJ (U+200D) or a single VS16 (U+FE0F) potentially followed by a ZWJ.
*
* They always have to follow an emoji (assumed to be a U+2600 or higher code point). */
if ( (*uch == 0x200d || *uch == 0xfe0f) && (!puch || *puch < 0x2600 || *puch == *uch) )
return 0;
/* Don't end unicode sequence on a ZWJ. */
if (*uch <= 0x2600 && puch && *puch == 0x200d)
return 0;
/* U+1F3F4 (waving black flag) followed by a number of U+E0030..U+E0039 (tag digit zero..nine) and
* U+E0061..U+E007A (tag latin small letter a..z) code points are used for regional flags. See [0].
*
* [0] https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences
*/
if (*uch == 0x1f3f4) {
/* We're already in the middle of a flag sequence. */
if (black_flag_or_tag_count)
return 0;
black_flag_or_tag_count = 1;
}
else if ( (*uch >= 0xe0030 && *uch <= 0xe0039) || (*uch >= 0xe0061 && *uch <= 0xe007a) )
{
/* We only support latin tags after a black flag. */
if (black_flag_or_tag_count < 1)
return 0;
black_flag_or_tag_count++;
}
else if (*uch == 0xe007f)
{
/* Cancel tags only come after a black flag and at least one alphanumeric tag. */
if (black_flag_or_tag_count <= 2)
return 0;
black_flag_or_tag_count = 0;
}
}
/* Truncated flag. */
if (black_flag_or_tag_count > 1)
return 0;
/* We can't end on a ZWJ */
if (puch && *puch == 0x200d)
return 0;
return 1;
}
/* hmmm.. there must be some problems with multibyte &
* other high ascii characters I think (such as german etc).
* Not sure if this can be solved? I don't think so... -- Syzop.
*/
static int do_nick_name_multibyte(char *nick)
{
int len;
char *ch;
char32_t uch, unick[NICKLEN+1], *uchp;
int firstmbchar = 0, utf8seq = 0, utf8width = 0;
if ((*nick == '-') || isdigit(*nick))
return 0;
uchp = unick;
for (ch=nick,len=0; *ch && len <= NICKLEN; ch++, len++)
{
/* Some characters are ALWAYS illegal, so they have to be disallowed here */
if ((*ch <= 32) || strchr(illegalnickchars, *ch))
return 0;
if (utf8seq && !firstmbchar)
{
if (((*ch) & 0xc0) == 0x80)
/* Still in a valid UTF-8 sequence */
utf8seq++;
else
/* We were in an UTF-8 sequence but this byte's not valid, bail out */
return 0;
if (utf8seq == utf8width)
{
/* mbtowc is locale dependent so let's YOLO this ourselves */
if (utf8width == 4)
uch = (char32_t)(((ch[-3] & 0x7) << 18) + ((ch[-2] & 0x3f) << 12) + ((ch[-1] & 0x3f) << 6) + (*ch & 0x3f));
else if (utf8width == 3)
uch = (char32_t)(((ch[-2] & 0xf) << 12) + ((ch[-1] & 0x3f) << 6) + (*ch & 0x3f));
/* Disabled to prevent conflicts with existing multi-byte support. */
/*
else if (utf8width == 2)
uch = (char32_t)(((ch[-1] & 0x1f) << 6) + (*ch & 0x3f));
*/
else
/* How did we get here? */
return 0;
if (!isvalidunicodechar(uch))
return 0;
utf8seq = utf8width = 0;
*uchp++ = uch;
}
}
else if (!firstmbchar && ((*ch) & 0xf8) == 0xf0)
{
utf8seq = 1;
utf8width = 4;
}
else if (!firstmbchar && ((*ch) & 0xf0) == 0xe0)
{
utf8seq = 1;
utf8width = 3;
}
/* Disabled to prevent conflicts with existing multi-byte support. */
/*
else if (!firstmbchar && ((*ch) & 0xe0) == 0xc0)
{
utf8seq = 1;
utf8width = 2;
}
*/
else if (firstmbchar)
{
if (!isvalidmbyte(ch[-1], *ch))
return 0;
firstmbchar = 0;
/* Still blisfully ignoring the existing multibyte support on the
* emoji unicode side of things. */
*uchp++ = ch[-1];
*uchp++ = *ch;
}
else if ((*ch) & 0x80)
firstmbchar = 1;
else if (!isvalid(*ch))
return 0;
else
*uchp++ = *ch;
}
if (firstmbchar)
{
ch--;
len--;
}
if (utf8seq)
{
ch -= utf8seq;
len -= utf8seq;
}
*uchp++ = '\0';
if (!isvalidunicodenick(unick))
return 0;
*ch = '\0';
return len;
}
/** Does some very basic checking on remote nickname.
* It's only purpose is not to cause the whole network
* to fall down in pieces, that's all. Display problems
* are not really handled here. They are assumed to have been
* checked by PROTOCTL NICKCHARS= -- Syzop.
*/
int _do_remote_nick_name(char *nick)
{
char *c;
/* Don't allow nicks to start with a digit, ever. */
if ((*nick == '-') || isdigit(*nick))
return 0;
/* Now the other, more relaxed checks.. */
for (c=nick; *c; c++)
if ((*c <= 32) || strchr(illegalnickchars, *c))
return 0;
return (c - nick);
}
static LangList *charsys_find_language(char *name)
{
int start = 0;
int stop = ARRAY_SIZEOF(langlist)-1;
int mid;
while (start <= stop)
{
mid = (start+stop)/2;
if (!langlist[mid].directive || smycmp(name, langlist[mid].directive) < 0)
stop = mid-1;
else if (strcmp(name, langlist[mid].directive) == 0)
return &langlist[mid];
else
start = mid+1;
}
return NULL;
}
static LangList *charsys_find_language_code(char *code)
{
int i;
for (i = 0; langlist[i].code; i++)
if (!strcasecmp(langlist[i].code, code))
return &langlist[i];
return NULL;
}
/** Check if language is available. */
int charsys_test_language(char *name)
{
LangList *l = charsys_find_language(name);
if (l)
{
langav |= l->setflags;
if (!(l->setflags & LANGAV_UTF8))
non_utf8_nick_chars_in_use = 1;
return 1;
}
if (!strcmp(name, "euro-west"))
{
config_error("set::allowed-nickchars: ERROR: 'euro-west' got renamed to 'latin1'");
return 0;
}
return 0;
}
static void charsys_doadd_language(char *name)
{
LangList *l;
ILangList *li;
int found;
char tmp[512], *lang, *p;
l = charsys_find_language(name);
if (!l)
{
#ifdef DEBUGMODE
abort();
#endif
return;
}
strlcpy(tmp, l->code, sizeof(tmp));
for (lang = strtoken(&p, tmp, ","); lang; lang = strtoken(&p, NULL, ","))
{
/* Check if present... */
found=0;
for (li=ilanglist; li; li=li->next)
if (!strcmp(li->name, lang))
{
found = 1;
break;
}
if (!found)
{
/* Add... */
li = safe_alloc(sizeof(ILangList));
safe_strdup(li->name, lang);
AddListItem(li, ilanglist);
}
}
}
void charsys_add_language(char *name)
{
char latin1=0, latin2=0, w1250=0, w1251=0, chinese=0;
char latin_utf8=0, cyrillic_utf8=0;
/** Note: there could well be some characters missing in the lists below.
* While I've seen other altnernatives that just allow pretty much
* every accent that exists even for dutch (where we rarely use
* accents except for like 3 types), I rather prefer to use a bit more
* reasonable aproach ;). That said, anyone is welcome to make
* suggestions about characters that should be added (or removed)
* of course. -- Syzop
*/
/* Add our language to our list */
charsys_doadd_language(name);
/* GROUPS */
if (!strcmp(name, "latin-utf8"))
latin_utf8 = 1;
else if (!strcmp(name, "cyrillic-utf8"))
cyrillic_utf8 = 1;
else if (!strcmp(name, "latin1"))
latin1 = 1;
else if (!strcmp(name, "latin2"))
latin2 = 1;
else if (!strcmp(name, "windows-1250"))
w1250 = 1;
else if (!strcmp(name, "windows-1251"))
w1251 = 1;
else if (!strcmp(name, "chinese") || !strcmp(name, "gbk"))
chinese = 1;
/* INDIVIDUAL CHARSETS */
/* [LATIN1] and [LATIN-UTF8] */
if (latin1 || !strcmp(name, "german"))
{
/* a", A", o", O", u", U" and es-zett */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "german-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x84, 0x84);
charsys_addmultibyterange(0xc3, 0xc3, 0x96, 0x96);
charsys_addmultibyterange(0xc3, 0xc3, 0x9c, 0x9c);
charsys_addmultibyterange(0xc3, 0xc3, 0x9f, 0x9f);
charsys_addmultibyterange(0xc3, 0xc3, 0xa4, 0xa4);
charsys_addmultibyterange(0xc3, 0xc3, 0xb6, 0xb6);
charsys_addmultibyterange(0xc3, 0xc3, 0xbc, 0xbc);
}
if (latin1 || !strcmp(name, "swiss-german"))
{
/* a", A", o", O", u", U" */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "swiss-german-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x84, 0x84);
charsys_addmultibyterange(0xc3, 0xc3, 0x96, 0x96);
charsys_addmultibyterange(0xc3, 0xc3, 0x9c, 0x9c);
charsys_addmultibyterange(0xc3, 0xc3, 0xa4, 0xa4);
charsys_addmultibyterange(0xc3, 0xc3, 0xb6, 0xb6);
charsys_addmultibyterange(0xc3, 0xc3, 0xbc, 0xbc);
}
if (latin1 || !strcmp(name, "dutch"))
{
/* Ok, even though I'm Dutch myself, I've trouble getting
* a proper list of this ;). I think I got them all now, but
* I did not include "borrow-words" like words we use in Dutch
* that are literal French. So if you really want to use them all,
* I suggest you to use just latin1 :P.
*/
/* e', e", o", i", u", e`. */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "dutch-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0xa8, 0xa9);
charsys_addmultibyterange(0xc3, 0xc3, 0xab, 0xab);
charsys_addmultibyterange(0xc3, 0xc3, 0xaf, 0xaf);
charsys_addmultibyterange(0xc3, 0xc3, 0xb6, 0xb6);
charsys_addmultibyterange(0xc3, 0xc3, 0xbc, 0xbc);
}
if (latin1 || !strcmp(name, "danish"))
{
/* supplied by klaus:
* <ae>, <AE>, ao, Ao, o/, O/ */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "danish-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x85, 0x86);
charsys_addmultibyterange(0xc3, 0xc3, 0x98, 0x98);
charsys_addmultibyterange(0xc3, 0xc3, 0xa5, 0xa6);
charsys_addmultibyterange(0xc3, 0xc3, 0xb8, 0xb8);
}
if (latin1 || !strcmp(name, "french"))
{
/* A`, A^, a`, a^, weird-C, weird-c, E`, E', E^, E", e`, e', e^, e",
* I^, I", i^, i", O^, o^, U`, U^, U", u`, u", u`, y" [not in that order, sry]
* Hmm.. there might be more, but I'm not sure how common they are
* and I don't think they are always displayed correctly (?).
*/
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "french-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x80, 0x80);
charsys_addmultibyterange(0xc3, 0xc3, 0x82, 0x82);
charsys_addmultibyterange(0xc3, 0xc3, 0x87, 0x8b);
charsys_addmultibyterange(0xc3, 0xc3, 0x8e, 0x8f);
charsys_addmultibyterange(0xc3, 0xc3, 0x94, 0x94);
charsys_addmultibyterange(0xc3, 0xc3, 0x99, 0x99);
charsys_addmultibyterange(0xc3, 0xc3, 0x9b, 0x9c);
charsys_addmultibyterange(0xc3, 0xc3, 0xa0, 0xa0);
charsys_addmultibyterange(0xc3, 0xc3, 0xa2, 0xa2);
charsys_addmultibyterange(0xc3, 0xc3, 0xa7, 0xab);
charsys_addmultibyterange(0xc3, 0xc3, 0xae, 0xaf);
charsys_addmultibyterange(0xc3, 0xc3, 0xb4, 0xb4);
charsys_addmultibyterange(0xc3, 0xc3, 0xb9, 0xb9);
charsys_addmultibyterange(0xc3, 0xc3, 0xbb, 0xbc);
charsys_addmultibyterange(0xc3, 0xc3, 0xbf, 0xbf);
}
if (latin1 || !strcmp(name, "spanish"))
{
/* a', A', e', E', i', I', o', O', u', U', u", U", n~, N~ */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "spanish-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x81, 0x81);
charsys_addmultibyterange(0xc3, 0xc3, 0x89, 0x89);
charsys_addmultibyterange(0xc3, 0xc3, 0x8d, 0x8d);
charsys_addmultibyterange(0xc3, 0xc3, 0x91, 0x91);
charsys_addmultibyterange(0xc3, 0xc3, 0x93, 0x93);
charsys_addmultibyterange(0xc3, 0xc3, 0x9a, 0x9a);
charsys_addmultibyterange(0xc3, 0xc3, 0x9c, 0x9c);
charsys_addmultibyterange(0xc3, 0xc3, 0xa1, 0xa1);
charsys_addmultibyterange(0xc3, 0xc3, 0xa9, 0xa9);
charsys_addmultibyterange(0xc3, 0xc3, 0xad, 0xad);
charsys_addmultibyterange(0xc3, 0xc3, 0xb1, 0xb1);
charsys_addmultibyterange(0xc3, 0xc3, 0xb3, 0xb3);
charsys_addmultibyterange(0xc3, 0xc3, 0xba, 0xba);
charsys_addmultibyterange(0xc3, 0xc3, 0xbc, 0xbc);
}
if (latin1 || !strcmp(name, "italian"))
{
/* A`, E`, E', I`, I', O`, O', U`, U', a`, e`, e', i`, i', o`, o', u`, u' */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "italian-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x80, 0x80);
charsys_addmultibyterange(0xc3, 0xc3, 0x88, 0x89);
charsys_addmultibyterange(0xc3, 0xc3, 0x8c, 0x8d);
charsys_addmultibyterange(0xc3, 0xc3, 0x92, 0x93);
charsys_addmultibyterange(0xc3, 0xc3, 0x99, 0x9a);
charsys_addmultibyterange(0xc3, 0xc3, 0xa0, 0xa0);
charsys_addmultibyterange(0xc3, 0xc3, 0xa8, 0xa9);
charsys_addmultibyterange(0xc3, 0xc3, 0xac, 0xad);
charsys_addmultibyterange(0xc3, 0xc3, 0xb2, 0xb3);
charsys_addmultibyterange(0xc3, 0xc3, 0xb9, 0xba);
}
if (latin1 || !strcmp(name, "catalan"))
{
/* supplied by Trocotronic */
/* a`, A`, e`, weird-c, weird-C, E`, e', E', i', I', o`, O`, o', O', u', U', i", I", u", U", weird-dot */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "catalan-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x80, 0x80);
charsys_addmultibyterange(0xc3, 0xc3, 0x87, 0x89);
charsys_addmultibyterange(0xc3, 0xc3, 0x8d, 0x8d);
charsys_addmultibyterange(0xc3, 0xc3, 0x8f, 0x8f);
charsys_addmultibyterange(0xc3, 0xc3, 0x92, 0x93);
charsys_addmultibyterange(0xc3, 0xc3, 0x9a, 0x9a);
charsys_addmultibyterange(0xc3, 0xc3, 0x9c, 0x9c);
charsys_addmultibyterange(0xc3, 0xc3, 0xa0, 0xa0);
charsys_addmultibyterange(0xc3, 0xc3, 0xa7, 0xa9);
charsys_addmultibyterange(0xc3, 0xc3, 0xad, 0xad);
charsys_addmultibyterange(0xc3, 0xc3, 0xaf, 0xaf);
charsys_addmultibyterange(0xc3, 0xc3, 0xb2, 0xb3);
charsys_addmultibyterange(0xc3, 0xc3, 0xba, 0xba);
}
if (latin1 || !strcmp(name, "swedish"))
{
/* supplied by Tank */
/* ao, Ao, a", A", o", O" */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "swedish-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x84, 0x85);
charsys_addmultibyterange(0xc3, 0xc3, 0x96, 0x96);
charsys_addmultibyterange(0xc3, 0xc3, 0xa4, 0xa5);
charsys_addmultibyterange(0xc3, 0xc3, 0xb6, 0xb6);
}
if (latin1 || !strcmp(name, "icelandic"))
{
/* supplied by Saevar */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "icelandic-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x81, 0x81);
charsys_addmultibyterange(0xc3, 0xc3, 0x86, 0x86);
charsys_addmultibyterange(0xc3, 0xc3, 0x8d, 0x8d);
charsys_addmultibyterange(0xc3, 0xc3, 0x90, 0x90);
charsys_addmultibyterange(0xc3, 0xc3, 0x93, 0x93);
charsys_addmultibyterange(0xc3, 0xc3, 0x96, 0x96);
charsys_addmultibyterange(0xc3, 0xc3, 0x9a, 0x9a);
charsys_addmultibyterange(0xc3, 0xc3, 0x9d, 0x9e);
charsys_addmultibyterange(0xc3, 0xc3, 0xa1, 0xa1);
charsys_addmultibyterange(0xc3, 0xc3, 0xa6, 0xa6);
charsys_addmultibyterange(0xc3, 0xc3, 0xad, 0xad);
charsys_addmultibyterange(0xc3, 0xc3, 0xb0, 0xb0);
charsys_addmultibyterange(0xc3, 0xc3, 0xb3, 0xb3);
charsys_addmultibyterange(0xc3, 0xc3, 0xb6, 0xb6);
charsys_addmultibyterange(0xc3, 0xc3, 0xba, 0xba);
charsys_addmultibyterange(0xc3, 0xc3, 0xbd, 0xbe);
}
/* [LATIN2] and rest of [LATIN-UTF8] */
/* actually hungarian is a special case, include it in both w1250 and latin2 ;p */
if (latin2 || w1250 || !strcmp(name, "hungarian"))
{
/* supplied by AngryWolf */
/* a', e', i', o', o", o~, u', u", u~, A', E', I', O', O", O~, U', U", U~ */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "hungarian-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x81, 0x81);
charsys_addmultibyterange(0xc3, 0xc3, 0x89, 0x89);
charsys_addmultibyterange(0xc3, 0xc3, 0x8d, 0x8d);
charsys_addmultibyterange(0xc3, 0xc3, 0x93, 0x93);
charsys_addmultibyterange(0xc3, 0xc3, 0x96, 0x96);
charsys_addmultibyterange(0xc3, 0xc3, 0x9a, 0x9a);
charsys_addmultibyterange(0xc3, 0xc3, 0x9c, 0x9c);
charsys_addmultibyterange(0xc3, 0xc3, 0xa1, 0xa1);
charsys_addmultibyterange(0xc3, 0xc3, 0xa9, 0xa9);
charsys_addmultibyterange(0xc3, 0xc3, 0xad, 0xad);
charsys_addmultibyterange(0xc3, 0xc3, 0xb3, 0xb3);
charsys_addmultibyterange(0xc3, 0xc3, 0xb6, 0xb6);
charsys_addmultibyterange(0xc3, 0xc3, 0xba, 0xba);
charsys_addmultibyterange(0xc3, 0xc3, 0xbc, 0xbc);
charsys_addmultibyterange(0xc5, 0xc5, 0x90, 0x91);
charsys_addmultibyterange(0xc5, 0xc5, 0xb0, 0xb1);
}
/* same is true for romanian: latin2 & w1250 compatible */
if (latin2 || w1250 || !strcmp(name, "romanian"))
{
/* With some help from crazytoon */
/* 'S,' 's,' 'A^' 'A<' 'I^' 'T,' 'a^' 'a<' 'i^' 't,' */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "romanian-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x82, 0x82);
charsys_addmultibyterange(0xc3, 0xc3, 0x8e, 0x8e);
charsys_addmultibyterange(0xc3, 0xc3, 0xa2, 0xa2);
charsys_addmultibyterange(0xc3, 0xc3, 0xae, 0xae);
charsys_addmultibyterange(0xc4, 0xc4, 0x82, 0x83);
charsys_addmultibyterange(0xc5, 0xc5, 0x9e, 0x9f);
charsys_addmultibyterange(0xc5, 0xc5, 0xa2, 0xa3);
}
if (latin2 || !strcmp(name, "polish"))
{
/* supplied by k4be */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>󶿼<EFBFBD><EFBFBD>ʣ<EFBFBD>Ӧ<EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "polish-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x93, 0x93);
charsys_addmultibyterange(0xc3, 0xc3, 0xb3, 0xb3);
charsys_addmultibyterange(0xc4, 0xc4, 0x84, 0x87);
charsys_addmultibyterange(0xc4, 0xc4, 0x98, 0x99);
charsys_addmultibyterange(0xc5, 0xc5, 0x81, 0x84);
charsys_addmultibyterange(0xc5, 0xc5, 0x9a, 0x9b);
charsys_addmultibyterange(0xc5, 0xc5, 0xb9, 0xbc);
}
/* [windows 1250] */
if (w1250 || !strcmp(name, "polish-w1250"))
{
/* supplied by k4be */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>󜿟<EFBFBD><EFBFBD>ʣ<EFBFBD>ӌ<EFBFBD><EFBFBD>");
}
if (w1250 || !strcmp(name, "czech-w1250"))
{
/* Syzop [probably incomplete] */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "czech-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x81, 0x81);
charsys_addmultibyterange(0xc3, 0xc3, 0x89, 0x89);
charsys_addmultibyterange(0xc3, 0xc3, 0x8d, 0x8d);
charsys_addmultibyterange(0xc3, 0xc3, 0x93, 0x93);
charsys_addmultibyterange(0xc3, 0xc3, 0x9a, 0x9a);
charsys_addmultibyterange(0xc3, 0xc3, 0x9d, 0x9d);
charsys_addmultibyterange(0xc3, 0xc3, 0xa1, 0xa1);
charsys_addmultibyterange(0xc3, 0xc3, 0xa9, 0xa9);
charsys_addmultibyterange(0xc3, 0xc3, 0xad, 0xad);
charsys_addmultibyterange(0xc3, 0xc3, 0xb3, 0xb3);
charsys_addmultibyterange(0xc3, 0xc3, 0xba, 0xba);
charsys_addmultibyterange(0xc3, 0xc3, 0xbd, 0xbd);
charsys_addmultibyterange(0xc4, 0xc4, 0x8c, 0x8f);
charsys_addmultibyterange(0xc4, 0xc4, 0x9a, 0x9b);
charsys_addmultibyterange(0xc5, 0xc5, 0x87, 0x88);
charsys_addmultibyterange(0xc5, 0xc5, 0x98, 0x99);
charsys_addmultibyterange(0xc5, 0xc5, 0xa0, 0xa1);
charsys_addmultibyterange(0xc5, 0xc5, 0xa4, 0xa5);
charsys_addmultibyterange(0xc5, 0xc5, 0xae, 0xaf);
charsys_addmultibyterange(0xc5, 0xc5, 0xbd, 0xbe);
}
if (w1250 || !strcmp(name, "slovak-w1250"))
{
/* Syzop [probably incomplete] */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (latin_utf8 || !strcmp(name, "slovak-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x81, 0x81);
charsys_addmultibyterange(0xc3, 0xc3, 0x84, 0x84);
charsys_addmultibyterange(0xc3, 0xc3, 0x89, 0x89);
charsys_addmultibyterange(0xc3, 0xc3, 0x8d, 0x8d);
charsys_addmultibyterange(0xc3, 0xc3, 0xa1, 0xa1);
charsys_addmultibyterange(0xc3, 0xc3, 0xa4, 0xa4);
charsys_addmultibyterange(0xc3, 0xc3, 0xa9, 0xa9);
charsys_addmultibyterange(0xc3, 0xc3, 0xad, 0xad);
charsys_addmultibyterange(0xc3, 0xc3, 0xb3, 0xb4);
charsys_addmultibyterange(0xc3, 0xc3, 0xba, 0xba);
charsys_addmultibyterange(0xc3, 0xc3, 0xbd, 0xbd);
charsys_addmultibyterange(0xc4, 0xc4, 0x8c, 0x8f);
charsys_addmultibyterange(0xc4, 0xc4, 0xb9, 0xba);
charsys_addmultibyterange(0xc4, 0xc4, 0xbd, 0xbe);
charsys_addmultibyterange(0xc5, 0xc5, 0x88, 0x88);
charsys_addmultibyterange(0xc5, 0xc5, 0x94, 0x95);
charsys_addmultibyterange(0xc5, 0xc5, 0xa0, 0xa1);
charsys_addmultibyterange(0xc5, 0xc5, 0xa4, 0xa5);
charsys_addmultibyterange(0xc5, 0xc5, 0xbd, 0xbe);
}
/* [windows 1251] */
if (w1251 || !strcmp(name, "russian-w1251"))
{
/* supplied by Roman Parkin:
* 128-159 and 223-254
*/
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (cyrillic_utf8 || !strcmp(name, "russian-utf8"))
{
charsys_addmultibyterange(0xd0, 0xd0, 0x81, 0x81);
charsys_addmultibyterange(0xd0, 0xd0, 0x90, 0xbf);
charsys_addmultibyterange(0xd1, 0xd1, 0x80, 0x8f);
charsys_addmultibyterange(0xd1, 0xd1, 0x91, 0x91);
}
if (w1251 || !strcmp(name, "belarussian-w1251"))
{
/* supplied by Bock (Samets Anton) & ss:
* 128-159, 161, 162, 178, 179 and 223-254
* Corrected 01.11.2006 to more "correct" behavior by Bock
*/
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ũ<EFBFBD>Dz<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӡ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (cyrillic_utf8 || !strcmp(name, "belarussian-utf8"))
{
charsys_addmultibyterange(0xd0, 0xd0, 0x81, 0x81);
charsys_addmultibyterange(0xd0, 0xd0, 0x86, 0x86);
charsys_addmultibyterange(0xd0, 0xd0, 0x8e, 0x8e);
charsys_addmultibyterange(0xd0, 0xd0, 0x90, 0x97);
charsys_addmultibyterange(0xd0, 0xd0, 0x99, 0xa8);
charsys_addmultibyterange(0xd0, 0xd0, 0xab, 0xb7);
charsys_addmultibyterange(0xd0, 0xd0, 0xb9, 0xbf);
charsys_addmultibyterange(0xd1, 0xd1, 0x80, 0x88);
charsys_addmultibyterange(0xd1, 0xd1, 0x8b, 0x8f);
charsys_addmultibyterange(0xd1, 0xd1, 0x91, 0x91);
charsys_addmultibyterange(0xd1, 0xd1, 0x96, 0x96);
charsys_addmultibyterange(0xd1, 0xd1, 0x9e, 0x9e);
}
if (w1251 || !strcmp(name, "ukrainian-w1251"))
{
/* supplied by Anton Samets & ss:
* 128-159, 170, 175, 178, 179, 186, 191 and 223-254
* Corrected 01.11.2006 to more "correct" behavior by core
*/
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD>å<EFBFBD>Ū<EFBFBD><EFBFBD>Ȳ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>賿<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (cyrillic_utf8 || !strcmp(name, "ukrainian-utf8"))
{
charsys_addmultibyterange(0xd0, 0xd0, 0x84, 0x84);
charsys_addmultibyterange(0xd0, 0xd0, 0x86, 0x87);
charsys_addmultibyterange(0xd0, 0xd0, 0x90, 0xa9);
charsys_addmultibyterange(0xd0, 0xd0, 0xac, 0xac);
charsys_addmultibyterange(0xd0, 0xd0, 0xae, 0xbf);
charsys_addmultibyterange(0xd1, 0xd1, 0x80, 0x89);
charsys_addmultibyterange(0xd1, 0xd1, 0x8c, 0x8c);
charsys_addmultibyterange(0xd1, 0xd1, 0x8e, 0x8f);
charsys_addmultibyterange(0xd1, 0xd1, 0x94, 0x94);
charsys_addmultibyterange(0xd1, 0xd1, 0x96, 0x97);
charsys_addmultibyterange(0xd2, 0xd2, 0x90, 0x91);
}
/* [GREEK] */
if (!strcmp(name, "greek"))
{
/* supplied by GSF */
/* ranges from rfc1947 / iso 8859-7 */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (!strcmp(name, "greek-utf8"))
{
charsys_addmultibyterange(0xce, 0xce, 0x86, 0x86);
charsys_addmultibyterange(0xce, 0xce, 0x88, 0x8a);
charsys_addmultibyterange(0xce, 0xce, 0x8c, 0x8c);
charsys_addmultibyterange(0xce, 0xce, 0x8e, 0xa1);
charsys_addmultibyterange(0xce, 0xce, 0xa3, 0xbf);
charsys_addmultibyterange(0xcf, 0xcf, 0x80, 0x84);
}
/* [TURKISH] */
if (!strcmp(name, "turkish"))
{
/* Supplied by Ayberk Yancatoral */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (!strcmp(name, "turkish-utf8"))
{
charsys_addmultibyterange(0xc3, 0xc3, 0x87, 0x87);
charsys_addmultibyterange(0xc3, 0xc3, 0x96, 0x96);
charsys_addmultibyterange(0xc3, 0xc3, 0x9c, 0x9c);
charsys_addmultibyterange(0xc3, 0xc3, 0xa7, 0xa7);
charsys_addmultibyterange(0xc3, 0xc3, 0xb6, 0xb6);
charsys_addmultibyterange(0xc3, 0xc3, 0xbc, 0xbc);
charsys_addmultibyterange(0xc4, 0xc4, 0x9e, 0x9f);
charsys_addmultibyterange(0xc4, 0xc4, 0xb1, 0xb1);
charsys_addmultibyterange(0xc5, 0xc5, 0x9e, 0x9f);
}
/* [HEBREW] */
if (!strcmp(name, "hebrew"))
{
/* Supplied by PHANTOm. */
/* 0xE0 - 0xFE */
charsys_addallowed("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>");
}
if (!strcmp(name, "hebrew-utf8"))
{
/* Supplied by Lion-O */
charsys_addmultibyterange(0xd7, 0xd7, 0x90, 0xaa);
}
/* [CHINESE] */
if (chinese || !strcmp(name, "chinese-ja"))
{
charsys_addmultibyterange(0xa4, 0xa4, 0xa1, 0xf3); /* JIS_PIN */
charsys_addmultibyterange(0xa5, 0xa5, 0xa1, 0xf6); /* JIS_PIN */
}
if (chinese || !strcmp(name, "chinese-simp"))
{
charsys_addmultibyterange(0xb0, 0xd6, 0xa1, 0xfe); /* GBK/2 BC with GB2312 */
charsys_addmultibyterange(0xd7, 0xd7, 0xa1, 0xf9); /* GBK/2 BC with GB2312 */
charsys_addmultibyterange(0xd8, 0xf7, 0xa1, 0xfe); /* GBK/2 BC with GB2312 */
}
if (chinese || !strcmp(name, "chinese-trad"))
{
charsys_addmultibyterange(0x81, 0xa0, 0x40, 0x7e); /* GBK/3 - lower half */
charsys_addmultibyterange(0x81, 0xa0, 0x80, 0xfe); /* GBK/3 - upper half */
charsys_addmultibyterange(0xaa, 0xfe, 0x40, 0x7e); /* GBK/4 - lower half */
charsys_addmultibyterange(0xaa, 0xfe, 0x80, 0xa0); /* GBK/4 - upper half */
}
/* [LATVIAN] */
if (latin_utf8 || !strcmp(name, "latvian-utf8"))
{
/* A a, C c, E e, G g, I i, K k, <20> <20>, U u, <20> <20> */
charsys_addmultibyterange(0xc4, 0xc4, 0x80, 0x81);
charsys_addmultibyterange(0xc4, 0xc4, 0x92, 0x93);
charsys_addmultibyterange(0xc4, 0xc4, 0x8c, 0x8d);
charsys_addmultibyterange(0xc4, 0xc4, 0x92, 0x93);
charsys_addmultibyterange(0xc4, 0xc4, 0xa2, 0xa3);
charsys_addmultibyterange(0xc4, 0xc4, 0xaa, 0xab);
charsys_addmultibyterange(0xc4, 0xc4, 0xb6, 0xb7);
charsys_addmultibyterange(0xc5, 0xc5, 0xa0, 0xa1);
charsys_addmultibyterange(0xc5, 0xc5, 0xaa, 0xab);
charsys_addmultibyterange(0xc5, 0xc5, 0xbd, 0xbe);
}
/* [ESTONIAN] */
if (latin_utf8 || !strcmp(name, "estonian-utf8"))
{
/* <20>, <20>, <20>, <20>, <20>, <20>, <20>, <20> */
charsys_addmultibyterange(0xc3, 0xc3, 0xb5, 0xb6);
charsys_addmultibyterange(0xc3, 0xc3, 0xa4, 0xa4);
charsys_addmultibyterange(0xc3, 0xc3, 0xbc, 0xbc);
charsys_addmultibyterange(0xc3, 0xc3, 0x95, 0x96);
charsys_addmultibyterange(0xc3, 0xc3, 0x84, 0x84);
charsys_addmultibyterange(0xc3, 0xc3, 0x9c, 0x9c);
}
/* [LITHUANIAN] */
if (latin_utf8 || !strcmp(name, "lithuanian-utf8"))
{
/* a, c, e, e, i, <20>, u, u, <20>, A, C, E, E, I, <20>, U, U, <20> */
charsys_addmultibyterange(0xc4, 0xc4, 0x84, 0x85);
charsys_addmultibyterange(0xc4, 0xc4, 0x8c, 0x8d);
charsys_addmultibyterange(0xc4, 0xc4, 0x96, 0x99);
charsys_addmultibyterange(0xc4, 0xc4, 0xae, 0xaf);
charsys_addmultibyterange(0xc4, 0xc4, 0xae, 0xaf);
charsys_addmultibyterange(0xc5, 0xc5, 0xa0, 0xa1);
charsys_addmultibyterange(0xc5, 0xc5, 0xb2, 0xb3);
charsys_addmultibyterange(0xc5, 0xc5, 0xaa, 0xab);
charsys_addmultibyterange(0xc5, 0xc5, 0xbd, 0xbe);
}
/* [ARABIC] */
if (latin_utf8 || !strcmp(name, "arabic-utf8"))
{
/* Supplied by Sensiva */
/*charsys_addallowed("اأإآءبتثجحخدذرزسشصضطظعغفقكلمنهؤةويىئ");*/
/*- From U+0621 to U+063A (Regex: [\u0621-\u063A])*/
/* 0xd8a1 - 0xd8ba */
charsys_addmultibyterange(0xd8, 0xd8, 0xa1, 0xba);
/*- From U+0641 to U+064A (Regex: [\u0641-\u064A])*/
/* 0xd981 - 0xd98a */
charsys_addmultibyterange(0xd9, 0xd9, 0x81, 0x8a);
}
/* [EMOJI] */
if (!strcmp(name, "emoji-utf8"))
{
/* Wikipedia perhaps isn't the best place to base our ranges on
* but it works for now.
*
* ZWJ (U+200D) and Variant Selector 16 (U+FE0F) can be used in
* homoglyph attacks so there's a check in isvalidunicodenick
* to attempt to mitigate (but not remove!) the risk.
*
* Only 3 and 4 byte UTF-8 sequences are supported to make sure
* the UTF-8 decoding doesn't break the existing multi-byte
* support.
*
* isvalidunicodenick needs to be updated if any emoji code
* points under U+2600 are added.
*/
/* https://en.wikipedia.org/wiki/Zero-width_joiner */
charsys_addunicoderange(0x200d, 0x200d);
/* https://en.wikipedia.org/wiki/Miscellaneous_Symbols*/
charsys_addunicoderange(0x2600, 0x26ff);
/* https://en.wikipedia.org/wiki/Dingbat#Encoding */
charsys_addunicoderange(0x2700, 0x27bf);
/* https://en.wikipedia.org/wiki/Variation_Selectors_(Unicode_block) */
charsys_addunicoderange(0xfe0f, 0xfe0f);
/* https://en.wikipedia.org/wiki/Regional_indicator_symbol */
charsys_addunicoderange(0x1f1e6, 0x1f1ff);
/* https://en.wikipedia.org/wiki/Miscellaneous_Symbols_and_Pictographs */
charsys_addunicoderange(0x1f300, 0x1f5ff);
/* https://en.wikipedia.org/wiki/Emoticons_(Unicode_block)#Emoji_modifiers
* https://en.wikipedia.org/wiki/Miscellaneous_Symbols#Emoji_modifiers
* https://en.wikipedia.org/wiki/Supplemental_Symbols_and_Pictographs#Emoji_modifiers
* https://en.wikipedia.org/wiki/Miscellaneous_Symbols_and_Pictographs#Emoji_modifiers
* https://en.wikipedia.org/wiki/Transport_and_Map_Symbols#Emoji_modifiers
* https://en.wikipedia.org/wiki/Dingbat#Emoji_modifiers
*
* These are included in the previous range but still listed
* separately due to their importance.
*/
charsys_addunicoderange(0x1f3fb, 0x1f3ff);
/* https://en.wikipedia.org/wiki/Emoticons_(Unicode_block) */
charsys_addunicoderange(0x1f600, 0x1f64f);
/* https://en.wikipedia.org/wiki/Transport_and_Map_Symbols */
charsys_addunicoderange(0x1f680, 0x1f6ff);
/* https://en.wikipedia.org/wiki/Supplemental_Symbols_and_Pictographs */
charsys_addunicoderange(0x1f900, 0x1f9ff);
/* https://en.wikipedia.org/wiki/Symbols_and_Pictographs_Extended-A */
charsys_addunicoderange(0x1fa70, 0x1faff);
/* https://www.unicode.org/reports/tr51/#valid-emoji-tag-sequences */
/* tag digit zero .. tag digit nine */
charsys_addunicoderange(0xe0030, 0xe0039);
/* tag latin small letter a .. tag latin small letter z */
charsys_addunicoderange(0xe0061, 0xe007a);
/* cancel tag */
charsys_addunicoderange(0xe007f, 0xe007f);
}
}
/** This displays all the nick characters that are permitted */
char *charsys_displaychars(void)
{
/* TODO: Add ulist rendering */
#if 0
MBList *m;
unsigned char hibyte, lobyte;
#endif
static char buf[512];
int n = 0;
int i, j;
// char_atribs[(unsigned char)*s] |= ALLOWN;
for (i = 0; i <= 255; i++)
{
if (char_atribs[i] & ALLOWN)
buf[n++] = i;
/* (no bounds checking: first 255 characters always fit a 512 byte buffer) */
}
#if 0
for (m=mblist; m; m=m->next)
{
for (hibyte = m->s1; hibyte <= m->e1; hibyte++)
{
for (lobyte = m->s2; lobyte <= m->e2; lobyte++)
{
if (n >= sizeof(buf) - 3)
break; // break, or an attempt anyway
buf[n++] = hibyte;
buf[n++] = lobyte;
}
}
}
#endif
/* above didn't work due to multiple overlapping ranges permitted.
* try this instead (lazy).. this is only used in DEBUGMODE
* via a command line option anyway:
*/
for (i=0; i <= 255; i++)
{
for (j=0; j <= 255; j++)
{
if (isvalidmbyte(i, j))
{
if (n >= sizeof(buf) - 3)
break; // break, or an attempt anyway
buf[n++] = i;
buf[n++] = j;
}
}
}
buf[n] = '\0'; /* there's always room for a NUL */
return buf;
}
char *charsys_group(int v)
{
if (v & LANGAV_LATIN_UTF8)
return "Latin script";
if (v & LANGAV_CYRILLIC_UTF8)
return "Cyrillic script";
if (v & LANGAV_GREEK_UTF8)
return "Greek script";
if (v & LANGAV_HEBREW_UTF8)
return "Hebrew script";
if (v & LANGAV_ARABIC_UTF8)
return "Arabic script";
return "Other";
}
void charsys_dump_table(char *filter)
{
int i = 0;
for (i = 0; langlist[i].directive; i++)
{
char *charset = langlist[i].directive;
if (!match_simple(filter, charset))
continue; /* skip */
charsys_reset();
charsys_add_language(charset);
charsys_finish();
printf("%s;%s;%s\n", charset, charsys_group(langlist[i].setflags), charsys_displaychars());
}
}
/** Get current languages (the 'langsinuse' variable) */
char *_charsys_get_current_languages(void)
{
return langsinuse;
}