123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462 |
- /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
- #ifndef _XS_UNICODE_H
- #define _XS_UNICODE_H
- int xs_utf8_enc(char buf[4], unsigned int cpoint);
- int xs_is_utf8_cont_byte(char c);
- unsigned int xs_utf8_dec(const char **str);
- int xs_unicode_width(unsigned int cpoint);
- int xs_is_surrogate(unsigned int cpoint);
- int xs_is_diacritic(unsigned int cpoint);
- unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
- unsigned int xs_surrogate_enc(unsigned int cpoint);
- unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
- unsigned int *_xs_unicode_lower_search(unsigned int cpoint);
- #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
- #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
- unsigned int xs_unicode_to_upper(unsigned int cpoint);
- unsigned int xs_unicode_to_lower(unsigned int cpoint);
- int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
- int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
- int xs_unicode_is_alpha(unsigned int cpoint);
- int xs_unicode_is_right_to_left(unsigned int cpoint);
- #ifdef _XS_H
- xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
- xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);
- xs_str *xs_utf8_to_upper(const char *str);
- xs_str *xs_utf8_to_lower(const char *str);
- xs_str *xs_utf8_to_nfd(const char *str);
- xs_str *xs_utf8_to_nfc(const char *str);
- #endif
- #ifdef XS_IMPLEMENTATION
- #include <ctype.h>
- #ifndef xs_countof
- #define xs_countof(a) (sizeof((a)) / sizeof((*a)))
- #endif
int xs_utf8_enc(char buf[4], unsigned int cpoint)
/* stores the utf-8 encoding of cpoint in buf (1 to 4 bytes, no NUL
   terminator is written) and returns the number of bytes stored */
{
    int size;

    if (cpoint < 0x80) {
        /* 1 byte: the byte is the codepoint itself */
        buf[0] = cpoint & 0xff;
        size = 1;
    }
    else
    if (cpoint < 0x800) {
        /* 2 bytes: 110xxxxx 10xxxxxx */
        buf[0] = 0xc0 | (cpoint >> 6);
        buf[1] = 0x80 | (cpoint & 0x3f);
        size = 2;
    }
    else
    if (cpoint < 0x10000) {
        /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        buf[0] = 0xe0 | (cpoint >> 12);
        buf[1] = 0x80 | ((cpoint >> 6) & 0x3f);
        buf[2] = 0x80 | (cpoint & 0x3f);
        size = 3;
    }
    else {
        /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        buf[0] = 0xf0 | (cpoint >> 18);
        buf[1] = 0x80 | ((cpoint >> 12) & 0x3f);
        buf[2] = 0x80 | ((cpoint >> 6) & 0x3f);
        buf[3] = 0x80 | (cpoint & 0x3f);
        size = 4;
    }

    return size;
}
int xs_is_utf8_cont_byte(char c)
/* returns 1 if c has the 10xxxxxx bit pattern of an utf-8
   continuation byte, 0 otherwise */
{
    unsigned char top_bits = (unsigned char)c & 0xc0;

    return top_bits == 0x80;
}
unsigned int xs_utf8_dec(const char **str)
/* decodes the utf-8 sequence that starts at *str, moves *str past the
   bytes consumed and returns the codepoint. A truncated sequence, a
   stray continuation byte or an invalid lead byte yields 0xfffd (the
   replacement character), so 0 is only returned for a NUL byte. The
   first byte is always consumed, even when it is the NUL terminator */
{
    const char *p = *str;
    unsigned char c = *p++;
    unsigned int cpoint;
    int cb = 0;     /* continuation bytes still expected */

    if ((c & 0x80) == 0) {          /* 1 byte char */
        cpoint = c;
    }
    else
    if ((c & 0xe0) == 0xc0) {       /* 2 byte char */
        cpoint = (c & 0x1f) << 6;
        cb = 1;
    }
    else
    if ((c & 0xf0) == 0xe0) {       /* 3 byte char */
        cpoint = (c & 0x0f) << 12;
        cb = 2;
    }
    else
    if ((c & 0xf8) == 0xf0) {       /* 4 byte char */
        cpoint = (c & 0x07) << 18;
        cb = 3;
    }
    else {
        /* stray continuation byte or invalid lead byte: returning 0
           here would make callers that loop until 0 silently drop
           the rest of the string */
        cpoint = 0xfffd;
    }

    /* fold in the continuation bytes (10xxxxxx); the NUL terminator
       does not match that pattern, so the scan stops at end of string */
    while (cb > 0 && (*p & 0xc0) == 0x80) {
        cb--;
        cpoint |= (unsigned int)(*p++ & 0x3f) << (cb * 6);
    }

    /* incomplete or broken sequence? */
    if (cb)
        cpoint = 0xfffd;

    *str = p;

    return cpoint;
}
/** Unicode character width: intentionally dead simple **/

/* rows of { first codepoint, last codepoint, width in columns },
   kept in ascending order so they can be binary-searched */
static unsigned int xs_unicode_width_table[] = {
    0x300,      0x36f,      0,      /* diacritics */
    0x1100,     0x11ff,     2,      /* Hangul */
    0x2e80,     0xa4cf,     2,      /* CJK */
    0xac00,     0xd7a3,     2,      /* more Hangul */
    0xe000,     0xf8ff,     0,      /* private use */
    0xf900,     0xfaff,     2,      /* CJK compatibility */
    0xff00,     0xff60,     2,      /* full width things */
    0xffdf,     0xffe6,     2,      /* full width things */
    0x1f200,    0x1ffff,    2,      /* emojis */
    0x20000,    0x2fffd,    2       /* more CJK */
};


int xs_unicode_width(unsigned int cpoint)
/* returns how many columns a codepoint takes (0, 1 or 2); anything
   not covered by the width table is assumed to be 1 column wide */
{
    int lo = 0;
    int hi = (sizeof(xs_unicode_width_table) /
              sizeof(xs_unicode_width_table[0])) / 3 - 1;

    while (lo <= hi) {
        int mid = (lo + hi) / 2;
        unsigned int *row = xs_unicode_width_table + mid * 3;

        if (cpoint > row[1]) {
            /* beyond this range: keep looking in the upper half */
            lo = mid + 1;
        }
        else
        if (cpoint < row[0]) {
            /* before this range: keep looking in the lower half */
            hi = mid - 1;
        }
        else {
            /* inside the range */
            return row[2];
        }
    }

    return 1;
}
int xs_is_diacritic(unsigned int cpoint)
/* returns 1 if cpoint is in the 0x300 - 0x36f range, 0 otherwise */
{
    if (cpoint < 0x300)
        return 0;

    return cpoint <= 0x36f;
}
- /** surrogate pairs **/
int xs_is_surrogate(unsigned int cpoint)
/* returns 1 if cpoint lies in the surrogate range 0xd800 - 0xdfff
   (either half of a surrogate pair), 0 otherwise */
{
    return !(cpoint < 0xd800 || cpoint > 0xdfff);
}
unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
/* "decodes" a surrogate pair into a codepoint: p1 is the first
   (high) element and p2 the second (low) one; only the low 10 bits
   of each are used */
{
    /* 20 bit payload: 10 bits from each element */
    unsigned int payload = ((p1 & 0x3ff) << 10) | (p2 & 0x3ff);

    /* the 0x10000 offset must be added, not or-ed: the payload can
       have bit 16 set itself, and or-ing would collapse every pair
       from plane 2 onwards (d840 dc00 would yield 0x10000 instead
       of 0x20000) */
    return 0x10000 + payload;
}
unsigned int xs_surrogate_enc(unsigned int cpoint)
/* "encodes" a codepoint as a surrogate pair packed in one value:
   the first element in the upper 16 bits, the second in the lower 16 */
{
    /* 0xd7c0 is 0xd800 - (0x10000 >> 10), i.e. the 0x10000 offset is
       removed as part of the same addition */
    unsigned int first  = (cpoint >> 10) + 0xd7c0;
    unsigned int second = (cpoint & 0x3ff) + 0xdc00;

    return second | (first << 16);
}
- #ifdef _XS_H
- xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
- /* encodes an Unicode codepoint to utf-8 into str */
- {
- char tmp[4];
- int c = xs_utf8_enc(tmp, cpoint);
- str = xs_insert_m(str, *offset, tmp, c);
- *offset += c;
- return str;
- }
- xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
- /* encodes an Unicode codepoint to utf-8 into str */
- {
- int offset = strlen(str);
- return xs_utf8_insert(str, cpoint, &offset);
- }
- #endif /* _XS_H */
- #ifdef _XS_UNICODE_TBL_H
- /* include xs_unicode_tbl.h before this one to use these functions */
- unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
- /* searches for an uppercase codepoint in the case fold table */
- {
- int b = 0;
- int t = xs_countof(xs_unicode_case_fold_table) / 2 + 1;
- while (t >= b) {
- int n = (b + t) / 2;
- unsigned int *p = &xs_unicode_case_fold_table[n * 2];
- if (cpoint < p[0])
- t = n - 1;
- else
- if (cpoint > p[0])
- b = n + 1;
- else
- return p;
- }
- return NULL;
- }
- unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
- /* searches for a lowercase codepoint in the case fold table */
- {
- unsigned int *p = xs_unicode_case_fold_table;
- unsigned int *e = p + xs_countof(xs_unicode_case_fold_table);
- while (p < e) {
- if (cpoint == p[1])
- return p;
- p += 2;
- }
- return NULL;
- }
unsigned int xs_unicode_to_lower(unsigned int cpoint)
/* returns the lowercase version of cpoint, or cpoint itself if the
   case fold table has no entry for it */
{
    unsigned int *pair;

    /* ASCII is delegated to the C library */
    if (cpoint < 0x80)
        return tolower(cpoint);

    pair = _xs_unicode_upper_search(cpoint);

    if (pair != NULL)
        return pair[1];

    return cpoint;
}
unsigned int xs_unicode_to_upper(unsigned int cpoint)
/* returns the uppercase version of cpoint, or cpoint itself if the
   case fold table has no entry for it */
{
    unsigned int *pair;

    /* ASCII is delegated to the C library */
    if (cpoint < 0x80)
        return toupper(cpoint);

    pair = _xs_unicode_lower_search(cpoint);

    if (pair != NULL)
        return pair[0];

    return cpoint;
}
- int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
- /* applies unicode Normalization Form D */
- {
- int b = 0;
- int t = xs_countof(xs_unicode_nfd_table) / 3 - 1;
- while (t >= b) {
- int n = (b + t) / 2;
- unsigned int *p = &xs_unicode_nfd_table[n * 3];
- int c = cpoint - p[0];
- if (c < 0)
- t = n - 1;
- else
- if (c > 0)
- b = n + 1;
- else {
- *base = p[1];
- *diac = p[2];
- return 1;
- }
- }
- return 0;
- }
- int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
- /* applies unicode Normalization Form C */
- {
- unsigned int *p = xs_unicode_nfd_table;
- unsigned int *e = p + xs_countof(xs_unicode_nfd_table);
- while (p < e) {
- if (p[1] == base && p[2] == diac) {
- *cpoint = p[0];
- return 1;
- }
- p += 3;
- }
- return 0;
- }
- int xs_unicode_is_alpha(unsigned int cpoint)
- /* checks if a codepoint is an alpha (i.e. a letter) */
- {
- int b = 0;
- int t = xs_countof(xs_unicode_alpha_table) / 2 - 1;
- while (t >= b) {
- int n = (b + t) / 2;
- unsigned int *p = &xs_unicode_alpha_table[n * 2];
- if (cpoint < p[0])
- t = n - 1;
- else
- if (cpoint > p[1])
- b = n + 1;
- else
- return 1;
- }
- return 0;
- }
- int xs_unicode_is_right_to_left(unsigned int cpoint)
- /* checks if a codepoint is a right-to-left letter */
- {
- int b = 0;
- int t = xs_countof(xs_unicode_right_to_left_table) / 2 - 1;
- while (t >= b) {
- int n = (b + t) / 2;
- unsigned int *p = &xs_unicode_right_to_left_table[n * 2];
- if (cpoint < p[0])
- t = n - 1;
- else
- if (cpoint > p[1])
- b = n + 1;
- else
- return 1;
- }
- return 0;
- }
- #ifdef _XS_H
- xs_str *xs_utf8_to_upper(const char *str)
- {
- xs_str *s = xs_str_new(NULL);
- unsigned int cpoint;
- int offset = 0;
- while ((cpoint = xs_utf8_dec(&str))) {
- cpoint = xs_unicode_to_upper(cpoint);
- s = xs_utf8_insert(s, cpoint, &offset);
- }
- return s;
- }
- xs_str *xs_utf8_to_lower(const char *str)
- {
- xs_str *s = xs_str_new(NULL);
- unsigned int cpoint;
- int offset = 0;
- while ((cpoint = xs_utf8_dec(&str))) {
- cpoint = xs_unicode_to_lower(cpoint);
- s = xs_utf8_insert(s, cpoint, &offset);
- }
- return s;
- }
- xs_str *xs_utf8_to_nfd(const char *str)
- {
- xs_str *s = xs_str_new(NULL);
- unsigned int cpoint;
- int offset = 0;
- while ((cpoint = xs_utf8_dec(&str))) {
- unsigned int base;
- unsigned int diac;
- if (xs_unicode_nfd(cpoint, &base, &diac)) {
- s = xs_utf8_insert(s, base, &offset);
- s = xs_utf8_insert(s, diac, &offset);
- }
- else
- s = xs_utf8_insert(s, cpoint, &offset);
- }
- return s;
- }
- xs_str *xs_utf8_to_nfc(const char *str)
- {
- xs_str *s = xs_str_new(NULL);
- unsigned int cpoint;
- unsigned int base = 0;
- int offset = 0;
- while ((cpoint = xs_utf8_dec(&str))) {
- if (xs_is_diacritic(cpoint)) {
- if (xs_unicode_nfc(base, cpoint, &base))
- continue;
- }
- if (base)
- s = xs_utf8_insert(s, base, &offset);
- base = cpoint;
- }
- if (base)
- s = xs_utf8_insert(s, base, &offset);
- return s;
- }
- #endif /* _XS_H */
- #endif /* _XS_UNICODE_TBL_H */
- #endif /* XS_IMPLEMENTATION */
- #endif /* _XS_UNICODE_H */
|