/* xs_unicode.h */
  1. /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
  /** utf-8 encoding and decoding **/

  /* stores the utf-8 form of cpoint into buf and returns its size in bytes */
  int xs_utf8_enc(char buf[4], unsigned int cpoint);

  /* tells whether c is an utf-8 continuation byte */
  int xs_is_utf8_cont_byte(char c);

  /* decodes the utf-8 char at *str and moves *str past it */
  unsigned int xs_utf8_dec(const char **str);

  /** codepoint properties **/

  /* returns the display width in columns of cpoint (simplified) */
  int xs_unicode_width(unsigned int cpoint);

  /* tells whether cpoint is in the surrogate range 0xd800-0xdfff */
  int xs_is_surrogate(unsigned int cpoint);

  /* tells whether cpoint is in the 0x300-0x36f range */
  int xs_is_diacritic(unsigned int cpoint);

  /** surrogate pairs **/

  /* joins the two halves of a surrogate pair into a codepoint */
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);

  /* splits cpoint into a surrogate pair, returned as (p1 << 16) | p2 */
  unsigned int xs_surrogate_enc(unsigned int cpoint);

  /** case conversion and normalization (these are only implemented
      if xs_unicode_tbl.h was included before this file) **/

  /* return the case fold table entry { uppercase, lowercase } that has
     cpoint as its uppercase / lowercase member, or NULL if there is none */
  unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
  unsigned int *_xs_unicode_lower_search(unsigned int cpoint);

  #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
  #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))

  /* return cpoint converted to uppercase / lowercase, or cpoint itself
     when it has no conversion */
  unsigned int xs_unicode_to_upper(unsigned int cpoint);
  unsigned int xs_unicode_to_lower(unsigned int cpoint);

  /* splits cpoint into *base and *diac; returns 1 on success, 0 (leaving
     both untouched) if cpoint has no decomposition in the table */
  int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);

  /* joins base and diac into *cpoint; returns 1 on success, 0 (leaving
     *cpoint untouched) if the table has no such combination */
  int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);

  /* tells whether cpoint is inside one of the ranges of the alpha table */
  int xs_unicode_is_alpha(unsigned int cpoint);
  #ifdef _XS_H

  /** functions working on xs_str values, available only when _XS_H is
      already defined at this point **/

  /* insert the utf-8 form of cpoint into str, at byte position *offset
     (which is then moved forward) or at the end of the string */
  xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
  xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);

  /* return a new string with every codepoint of str converted
     (the case and normalization ones also need xs_unicode_tbl.h) */
  xs_str *xs_utf8_to_upper(const char *str);
  xs_str *xs_utf8_to_lower(const char *str);
  xs_str *xs_utf8_to_nfd(const char *str);
  xs_str *xs_utf8_to_nfc(const char *str);

  #endif /* _XS_H */
  29. #ifdef XS_IMPLEMENTATION
  30. #include <ctype.h>
  #ifndef xs_countof
  /* number of elements of an array (it must be a real array, not a
     pointer to its first element) */
  #define xs_countof(a) (sizeof(a) / sizeof((a)[0]))
  #endif
  int xs_utf8_enc(char buf[4], unsigned int cpoint)
  /* encodes an Unicode codepoint to utf-8 into buf and returns the size
     in bytes (1 to 4). buf is not NUL-terminated. Values above 0x10ffff
     are not codepoints and are stored as U+FFFD (3 bytes) */
  {
      char *p = buf;

      /* the 4 byte form only holds 21 bits and Unicode ends at 0x10ffff;
         anything bigger would spill bits over the lead byte marker and
         generate a malformed sequence */
      if (cpoint > 0x10ffff)
          cpoint = 0xfffd;

      if (cpoint < 0x80) {
          /* 1 byte char */
          *p++ = (char) cpoint;
      }
      else
      if (cpoint < 0x800) {
          /* 2 byte char */
          *p++ = (char) (0xc0 | (cpoint >> 6));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }
      else
      if (cpoint < 0x10000) {
          /* 3 byte char */
          *p++ = (char) (0xe0 | (cpoint >> 12));
          *p++ = (char) (0x80 | ((cpoint >> 6) & 0x3f));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }
      else {
          /* 4 byte char */
          *p++ = (char) (0xf0 | (cpoint >> 18));
          *p++ = (char) (0x80 | ((cpoint >> 12) & 0x3f));
          *p++ = (char) (0x80 | ((cpoint >> 6) & 0x3f));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }

      return (int) (p - buf);
  }
  int xs_is_utf8_cont_byte(char c)
  /* tells whether c is an utf-8 continuation byte (bits 10xxxxxx) */
  {
      /* only the two topmost bits matter */
      unsigned char marker = (unsigned char) c >> 6;

      return marker == 0x02;
  }
  unsigned int xs_utf8_dec(const char **str)
  /* decodes the utf-8 char at *str, moves *str past the bytes consumed
     and returns the codepoint. 0 is only returned for the NUL terminator
     (that is also stepped over, so don't call again after getting it);
     an invalid lead byte or an incomplete sequence returns U+FFFD */
  {
      const char *p = *str;
      unsigned int cpoint = 0;
      unsigned char c = *p++;
      int cb = 0;     /* continuation bytes still expected */

      if ((c & 0x80) == 0) {
          /* 1 byte char */
          cpoint = c;
      }
      else
      if ((c & 0xe0) == 0xc0) {
          /* 2 byte char */
          cpoint = (c & 0x1f) << 6;
          cb = 1;
      }
      else
      if ((c & 0xf0) == 0xe0) {
          /* 3 byte char */
          cpoint = (c & 0x0f) << 12;
          cb = 2;
      }
      else
      if ((c & 0xf8) == 0xf0) {
          /* 4 byte char */
          cpoint = (c & 0x07) << 18;
          cb = 3;
      }
      else {
          /* stray continuation byte or a lead byte that no encoding
             uses (0xf8-0xff): returning 0 here would make callers that
             loop until 0 silently truncate the string */
          cpoint = 0xfffd;
      }

      /* process the continuation bytes (a NUL never matches the 10xxxxxx
         pattern, so this also stops at the end of the string) */
      while (cb > 0 && (*p & 0xc0) == 0x80) {
          cb--;
          cpoint |= (unsigned int) (*p++ & 0x3f) << (cb * 6);
      }

      /* incomplete or broken? */
      if (cb)
          cpoint = 0xfffd;

      *str = p;

      return cpoint;
  }
  /** Unicode character width: intentionally dead simple **/

  /* sorted, non-overlapping rows of { first, last, width } */
  static unsigned int xs_unicode_width_table[] = {
      0x300,      0x36f,      0,      /* diacritics */
      0x1100,     0x11ff,     2,      /* Hangul */
      0x2e80,     0xa4cf,     2,      /* CJK */
      0xac00,     0xd7a3,     2,      /* more Hangul */
      0xe000,     0xf8ff,     0,      /* private use */
      0xf900,     0xfaff,     2,      /* CJK compatibility */
      0xff00,     0xff60,     2,      /* full width things */
      0xffdf,     0xffe6,     2,      /* full width things */
      0x1f200,    0x1ffff,    2,      /* emojis */
      0x20000,    0x2fffd,    2       /* more CJK */
  };

  int xs_unicode_width(unsigned int cpoint)
  /* returns the width in columns of a Unicode codepoint: the one stored
     in the table row that contains it, or 1 if no row does */
  {
      /* binary search over the rows in the half-open interval [lo, hi) */
      int lo = 0;
      int hi = (int) (sizeof(xs_unicode_width_table) /
                      sizeof(xs_unicode_width_table[0]) / 3);

      while (lo < hi) {
          int mid = lo + (hi - lo) / 2;
          const unsigned int *row = xs_unicode_width_table + mid * 3;

          if (cpoint < row[0])
              hi = mid;
          else if (cpoint > row[1])
              lo = mid + 1;
          else
              return (int) row[2];
      }

      return 1;
  }
  int xs_is_diacritic(unsigned int cpoint)
  /* tells whether cpoint is in the 0x300-0x36f range (the one the width
     table labels as diacritics) */
  {
      if (cpoint < 0x300)
          return 0;

      return cpoint <= 0x36f;
  }
  /** surrogate pairs **/

  int xs_is_surrogate(unsigned int cpoint)
  /* tells whether cpoint is in the surrogate range 0xd800-0xdfff; note
     that this accepts both halves of a pair, not only the first one */
  {
      return !(cpoint < 0xd800 || cpoint > 0xdfff);
  }
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
  /* "decodes" a surrogate pair (p1 the first half, p2 the second one)
     into a codepoint; only the 10 low bits of each half are used */
  {
      unsigned int hi = (p1 & 0x3ff) << 10;
      unsigned int lo = p2 & 0x3ff;

      /* the 0x10000 offset must be ADDED: hi can reach 0xffc00, so it
         overlaps bit 16 for every codepoint from U+20000 up and OR-ing
         the offset in would lose the carry */
      return 0x10000 + hi + lo;
  }
  unsigned int xs_surrogate_enc(unsigned int cpoint)
  /* "encodes" a codepoint into a surrogate pair, returned in a single
     value with the first half in the upper 16 bits and the second half
     in the lower ones */
  {
      /* 0xd7c0 is 0xd800 - (0x10000 >> 10): the 0x10000 offset of the
         codepoint is discounted from the base of the first half */
      unsigned int lead  = (cpoint >> 10) + 0xd7c0;
      unsigned int trail = (cpoint & 0x3ff) + 0xdc00;

      return trail | (lead << 16);
  }
  148. #ifdef _XS_H
  149. xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
  150. /* encodes an Unicode codepoint to utf-8 into str */
  151. {
  152. char tmp[4];
  153. int c = xs_utf8_enc(tmp, cpoint);
  154. str = xs_insert_m(str, *offset, tmp, c);
  155. *offset += c;
  156. return str;
  157. }
  158. xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
  159. /* encodes an Unicode codepoint to utf-8 into str */
  160. {
  161. int offset = strlen(str);
  162. return xs_utf8_insert(str, cpoint, &offset);
  163. }
  164. #endif /* _XS_H */
  165. #ifdef _XS_UNICODE_TBL_H
  166. /* include xs_unicode_tbl.h before this one to use these functions */
  167. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  168. /* searches for an uppercase codepoint in the case fold table */
  169. {
  170. int b = 0;
  171. int t = xs_countof(xs_unicode_case_fold_table) / 2 + 1;
  172. while (t >= b) {
  173. int n = (b + t) / 2;
  174. unsigned int *p = &xs_unicode_case_fold_table[n * 2];
  175. if (cpoint < p[0])
  176. t = n - 1;
  177. else
  178. if (cpoint > p[0])
  179. b = n + 1;
  180. else
  181. return p;
  182. }
  183. return NULL;
  184. }
  185. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  186. /* searches for a lowercase codepoint in the case fold table */
  187. {
  188. unsigned int *p = xs_unicode_case_fold_table;
  189. unsigned int *e = p + xs_countof(xs_unicode_case_fold_table);
  190. while (p < e) {
  191. if (cpoint == p[1])
  192. return p;
  193. p += 2;
  194. }
  195. return NULL;
  196. }
  unsigned int xs_unicode_to_lower(unsigned int cpoint)
  /* returns cpoint converted to lowercase, or cpoint itself if it's not
     an uppercase codepoint of the case fold table */
  {
      unsigned int *pair;

      /* ASCII is left to the C library */
      if (cpoint < 0x80)
          return tolower(cpoint);

      pair = _xs_unicode_upper_search(cpoint);

      if (pair != NULL)
          return pair[1];

      return cpoint;
  }
  unsigned int xs_unicode_to_upper(unsigned int cpoint)
  /* returns cpoint converted to uppercase, or cpoint itself if it's not
     a lowercase codepoint of the case fold table */
  {
      unsigned int *pair;

      /* ASCII is left to the C library */
      if (cpoint < 0x80)
          return toupper(cpoint);

      pair = _xs_unicode_lower_search(cpoint);

      if (pair != NULL)
          return pair[0];

      return cpoint;
  }
  213. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  214. /* applies unicode Normalization Form D */
  215. {
  216. int b = 0;
  217. int t = xs_countof(xs_unicode_nfd_table) / 3 - 1;
  218. while (t >= b) {
  219. int n = (b + t) / 2;
  220. unsigned int *p = &xs_unicode_nfd_table[n * 3];
  221. int c = cpoint - p[0];
  222. if (c < 0)
  223. t = n - 1;
  224. else
  225. if (c > 0)
  226. b = n + 1;
  227. else {
  228. *base = p[1];
  229. *diac = p[2];
  230. return 1;
  231. }
  232. }
  233. return 0;
  234. }
  235. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  236. /* applies unicode Normalization Form C */
  237. {
  238. unsigned int *p = xs_unicode_nfd_table;
  239. unsigned int *e = p + xs_countof(xs_unicode_nfd_table);
  240. while (p < e) {
  241. if (p[1] == base && p[2] == diac) {
  242. *cpoint = p[0];
  243. return 1;
  244. }
  245. p += 3;
  246. }
  247. return 0;
  248. }
  249. int xs_unicode_is_alpha(unsigned int cpoint)
  250. /* checks if a codepoint is an alpha (i.e. a letter) */
  251. {
  252. int b = 0;
  253. int t = xs_countof(xs_unicode_alpha_table) / 2 - 1;
  254. while (t >= b) {
  255. int n = (b + t) / 2;
  256. unsigned int *p = &xs_unicode_alpha_table[n * 2];
  257. if (cpoint < p[0])
  258. t = n - 1;
  259. else
  260. if (cpoint > p[1])
  261. b = n + 1;
  262. else
  263. return 1;
  264. }
  265. return 0;
  266. }
  267. #ifdef _XS_H
  268. xs_str *xs_utf8_to_upper(const char *str)
  269. {
  270. xs_str *s = xs_str_new(NULL);
  271. unsigned int cpoint;
  272. int offset = 0;
  273. while ((cpoint = xs_utf8_dec(&str))) {
  274. cpoint = xs_unicode_to_upper(cpoint);
  275. s = xs_utf8_insert(s, cpoint, &offset);
  276. }
  277. return s;
  278. }
  279. xs_str *xs_utf8_to_lower(const char *str)
  280. {
  281. xs_str *s = xs_str_new(NULL);
  282. unsigned int cpoint;
  283. int offset = 0;
  284. while ((cpoint = xs_utf8_dec(&str))) {
  285. cpoint = xs_unicode_to_lower(cpoint);
  286. s = xs_utf8_insert(s, cpoint, &offset);
  287. }
  288. return s;
  289. }
  290. xs_str *xs_utf8_to_nfd(const char *str)
  291. {
  292. xs_str *s = xs_str_new(NULL);
  293. unsigned int cpoint;
  294. int offset = 0;
  295. while ((cpoint = xs_utf8_dec(&str))) {
  296. unsigned int base;
  297. unsigned int diac;
  298. if (xs_unicode_nfd(cpoint, &base, &diac)) {
  299. s = xs_utf8_insert(s, base, &offset);
  300. s = xs_utf8_insert(s, diac, &offset);
  301. }
  302. else
  303. s = xs_utf8_insert(s, cpoint, &offset);
  304. }
  305. return s;
  306. }
  307. xs_str *xs_utf8_to_nfc(const char *str)
  308. {
  309. xs_str *s = xs_str_new(NULL);
  310. unsigned int cpoint;
  311. unsigned int base = 0;
  312. int offset = 0;
  313. while ((cpoint = xs_utf8_dec(&str))) {
  314. if (xs_is_diacritic(cpoint)) {
  315. if (xs_unicode_nfc(base, cpoint, &base))
  316. continue;
  317. }
  318. if (base)
  319. s = xs_utf8_insert(s, base, &offset);
  320. base = cpoint;
  321. }
  322. if (base)
  323. s = xs_utf8_insert(s, base, &offset);
  324. return s;
  325. }
  326. #endif /* _XS_H */
  327. #endif /* _XS_UNICODE_TBL_H */
  328. #endif /* XS_IMPLEMENTATION */
  329. #endif /* _XS_UNICODE_H */