xs_unicode.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
  /* encodes cpoint as utf-8 into buf; returns the number of bytes written */
  int xs_utf8_enc(char buf[4], unsigned int cpoint);
  /* true if c has the 10xxxxxx pattern of an utf-8 continuation byte */
  int xs_is_utf8_cont_byte(char c);
  /* decodes the utf-8 char at *str and moves *str past it */
  unsigned int xs_utf8_dec(const char **str);
  /* display width of cpoint in columns (0, 1 or 2; simplified) */
  int xs_unicode_width(unsigned int cpoint);
  /* true if cpoint is in the surrogate range 0xd800 - 0xdfff */
  int xs_is_surrogate(unsigned int cpoint);
  /* true if cpoint is in the diacritics range 0x300 - 0x36f */
  int xs_is_diacritic(unsigned int cpoint);
  /* joins the two elements of a surrogate pair into one codepoint */
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
  /* splits cpoint into a surrogate pair, returned as (p1 << 16) | p2 */
  unsigned int xs_surrogate_enc(unsigned int cpoint);
  /* case fold table lookups: pointer to the matching pair or NULL
     (only defined if xs_unicode_tbl.h was included before this file) */
  unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
  unsigned int *_xs_unicode_lower_search(unsigned int cpoint);
  #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
  #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
  /* case conversion of one codepoint; unknown codepoints are returned as is */
  unsigned int xs_unicode_to_upper(unsigned int cpoint);
  unsigned int xs_unicode_to_lower(unsigned int cpoint);
  /* splits cpoint into base + diacritic; returns 1 if it was found in the table */
  int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
  /* joins base + diacritic into *cpoint; returns 1 if the pair was found in the table */
  int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
  /* true if cpoint falls in one of the ranges of the alpha table */
  int xs_unicode_is_alpha(unsigned int cpoint);
  /* true if cpoint falls in one of the ranges of the right-to-left table */
  int xs_unicode_is_right_to_left(unsigned int cpoint);
  #ifdef _XS_H
  /* xs_str helpers, only declared when xs.h was included first */
  /* inserts cpoint as utf-8 at *offset and moves *offset past it */
  xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
  /* inserts cpoint as utf-8 at offset strlen(str) */
  xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);
  /* whole-string conversions of an utf-8 string, one codepoint at a time */
  xs_str *xs_utf8_to_upper(const char *str);
  xs_str *xs_utf8_to_lower(const char *str);
  xs_str *xs_utf8_to_nfd(const char *str);
  xs_str *xs_utf8_to_nfc(const char *str);
  #endif
  30. #ifdef XS_IMPLEMENTATION
  31. #include <ctype.h>
  #ifndef xs_countof
  /* number of elements of a true array (not a pointer); the argument is
     parenthesized before being indexed so that any array expression works */
  #define xs_countof(a) (sizeof(a) / sizeof((a)[0]))
  #endif
  int xs_utf8_enc(char buf[4], unsigned int cpoint)
  /* encodes an Unicode codepoint to utf-8 into buf (no NUL is added) and
     returns the size in bytes (1 to 4); codepoints above 0x10ffff have no
     utf-8 form and are stored as the replacement character 0xfffd */
  {
      char *p = buf;

      /* out of range: the 4 byte lead byte would be malformed */
      if (cpoint > 0x10ffff)
          cpoint = 0xfffd;

      if (cpoint < 0x80) {
          /* 1 byte char */
          *p++ = (char) cpoint;
      }
      else
      if (cpoint < 0x800) {
          /* 2 byte char */
          *p++ = (char) (0xc0 | (cpoint >> 6));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }
      else
      if (cpoint < 0x10000) {
          /* 3 byte char */
          *p++ = (char) (0xe0 | (cpoint >> 12));
          *p++ = (char) (0x80 | ((cpoint >> 6) & 0x3f));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }
      else {
          /* 4 byte char */
          *p++ = (char) (0xf0 | (cpoint >> 18));
          *p++ = (char) (0x80 | ((cpoint >> 12) & 0x3f));
          *p++ = (char) (0x80 | ((cpoint >> 6) & 0x3f));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }

      return (int) (p - buf);
  }
  int xs_is_utf8_cont_byte(char c)
  /* tells whether the two upper bits of c are 10, i.e. whether
     c is an utf-8 continuation byte */
  {
      unsigned char upper = (unsigned char) c & 0xc0;

      return upper == 0x80;
  }
  unsigned int xs_utf8_dec(const char **str)
  /* decodes the utf-8 char at *str, updates the pointer to the byte after
     the ones consumed and returns the codepoint; the replacement character
     0xfffd is returned for an invalid lead byte or an incomplete sequence,
     and 0 only for the NUL byte itself */
  {
      const char *p = *str;
      unsigned int cpoint = 0;
      unsigned char c = (unsigned char) *p++;
      int cb = 0; /* continuation bytes still expected */

      if ((c & 0x80) == 0) { /* 1 byte char */
          cpoint = c;
      }
      else
      if ((c & 0xe0) == 0xc0) { /* 2 byte char */
          cpoint = (c & 0x1f) << 6;
          cb = 1;
      }
      else
      if ((c & 0xf0) == 0xe0) { /* 3 byte char */
          cpoint = (c & 0x0f) << 12;
          cb = 2;
      }
      else
      if ((c & 0xf8) == 0xf0) { /* 4 byte char */
          cpoint = (c & 0x07) << 18;
          cb = 3;
      }
      else {
          /* stray continuation byte or impossible lead byte: it must not
             decode to 0, as callers take that as the end of the string */
          cpoint = 0xfffd;
      }

      /* process the continuation bytes (a NUL never matches 10xxxxxx,
         so this also stops at the end of the string) */
      while (cb > 0 && ((unsigned char) *p & 0xc0) == 0x80) {
          cb--;
          cpoint |= (unsigned int) (*p++ & 0x3f) << (cb * 6);
      }

      /* incomplete or broken? */
      if (cb)
          cpoint = 0xfffd;

      *str = p;

      return cpoint;
  }
  96. /** Unicode character width: intentionally dead simple **/
  /* rows of { first, last, width }, sorted by codepoint for the binary
     search below; read-only, so it is declared const */
  static const unsigned int xs_unicode_width_table[] = {
      0x300,   0x36f,   0,  /* diacritics */
      0x1100,  0x11ff,  2,  /* Hangul */
      0x2e80,  0xa4cf,  2,  /* CJK */
      0xac00,  0xd7a3,  2,  /* more Hangul */
      0xe000,  0xf8ff,  0,  /* private use */
      0xf900,  0xfaff,  2,  /* CJK compatibility */
      0xff00,  0xff60,  2,  /* full width things */
      0xffdf,  0xffe6,  2,  /* full width things */
      0x1f200, 0x1ffff, 2,  /* emojis */
      0x20000, 0x2fffd, 2   /* more CJK */
  };

  int xs_unicode_width(unsigned int cpoint)
  /* returns the width in columns of a Unicode codepoint (somewhat
     simplified): the width of the table range that contains it, or 1
     if it is not in any range */
  {
      int b = 0;
      int t = (int) (sizeof(xs_unicode_width_table) /
                     sizeof(xs_unicode_width_table[0]) / 3) - 1;

      while (t >= b) {
          int n = (b + t) / 2;
          const unsigned int *p = &xs_unicode_width_table[n * 3];

          if (cpoint < p[0])
              t = n - 1;
          else
          if (cpoint > p[1])
              b = n + 1;
          else
              return (int) p[2];
      }

      return 1;
  }
  int xs_is_diacritic(unsigned int cpoint)
  /* tells whether cpoint lies in the range 0x300 - 0x36f (both included) */
  {
      if (cpoint < 0x300)
          return 0;

      return cpoint <= 0x36f;
  }
  131. /** surrogate pairs **/
  int xs_is_surrogate(unsigned int cpoint)
  /* checks if cpoint is in the surrogate range 0xd800 - 0xdfff; note
     that both elements of a pair match, not only the first one */
  {
      return !(cpoint < 0xd800 || cpoint > 0xdfff);
  }
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
  /* "decodes" a surrogate pair (p1 first, p2 second) into a codepoint */
  {
      unsigned int hi = p1 & 0x3ff;
      unsigned int lo = p2 & 0x3ff;

      /* the 0x10000 bias has to be added, not or-ed: hi << 10 reaches
         bit 16 and above for every codepoint from 0x20000 on */
      return 0x10000 + ((hi << 10) | lo);
  }
  unsigned int xs_surrogate_enc(unsigned int cpoint)
  /* splits a codepoint into a surrogate pair and returns both elements
     in one value: the first in the upper 16 bits, the second in the lower */
  {
      /* 0xd7c0 is 0xd800 - (0x10000 >> 10), so the bias is already removed */
      unsigned int first = (cpoint >> 10) + 0xd7c0;
      unsigned int second = (cpoint & 0x3ff) + 0xdc00;

      return second | (first << 16);
  }
  149. #ifdef _XS_H
  150. xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
  151. /* encodes an Unicode codepoint to utf-8 into str */
  152. {
  153. char tmp[4];
  154. int c = xs_utf8_enc(tmp, cpoint);
  155. str = xs_insert_m(str, *offset, tmp, c);
  156. *offset += c;
  157. return str;
  158. }
  159. xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
  160. /* encodes an Unicode codepoint to utf-8 into str */
  161. {
  162. int offset = strlen(str);
  163. return xs_utf8_insert(str, cpoint, &offset);
  164. }
  165. #endif /* _XS_H */
  166. #ifdef _XS_UNICODE_TBL_H
  167. /* include xs_unicode_tbl.h before this one to use these functions */
  168. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  169. /* searches for an uppercase codepoint in the case fold table */
  170. {
  171. int b = 0;
  172. int t = xs_countof(xs_unicode_case_fold_table) / 2 + 1;
  173. while (t >= b) {
  174. int n = (b + t) / 2;
  175. unsigned int *p = &xs_unicode_case_fold_table[n * 2];
  176. if (cpoint < p[0])
  177. t = n - 1;
  178. else
  179. if (cpoint > p[0])
  180. b = n + 1;
  181. else
  182. return p;
  183. }
  184. return NULL;
  185. }
  186. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  187. /* searches for a lowercase codepoint in the case fold table */
  188. {
  189. unsigned int *p = xs_unicode_case_fold_table;
  190. unsigned int *e = p + xs_countof(xs_unicode_case_fold_table);
  191. while (p < e) {
  192. if (cpoint == p[1])
  193. return p;
  194. p += 2;
  195. }
  196. return NULL;
  197. }
  unsigned int xs_unicode_to_lower(unsigned int cpoint)
  /* converts cpoint to lowercase: tolower() for anything below 0x80,
     the case fold table for the rest; codepoints not in the table
     are returned untouched */
  {
      unsigned int *pair;

      if (cpoint < 0x80)
          return tolower(cpoint);

      pair = _xs_unicode_upper_search(cpoint);

      if (pair != NULL)
          return pair[1];

      return cpoint;
  }
  unsigned int xs_unicode_to_upper(unsigned int cpoint)
  /* converts cpoint to uppercase: toupper() for anything below 0x80,
     the case fold table for the rest; codepoints not in the table
     are returned untouched */
  {
      unsigned int *pair;

      if (cpoint < 0x80)
          return toupper(cpoint);

      pair = _xs_unicode_lower_search(cpoint);

      if (pair != NULL)
          return pair[0];

      return cpoint;
  }
  214. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  215. /* applies unicode Normalization Form D */
  216. {
  217. int b = 0;
  218. int t = xs_countof(xs_unicode_nfd_table) / 3 - 1;
  219. while (t >= b) {
  220. int n = (b + t) / 2;
  221. unsigned int *p = &xs_unicode_nfd_table[n * 3];
  222. int c = cpoint - p[0];
  223. if (c < 0)
  224. t = n - 1;
  225. else
  226. if (c > 0)
  227. b = n + 1;
  228. else {
  229. *base = p[1];
  230. *diac = p[2];
  231. return 1;
  232. }
  233. }
  234. return 0;
  235. }
  236. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  237. /* applies unicode Normalization Form C */
  238. {
  239. unsigned int *p = xs_unicode_nfd_table;
  240. unsigned int *e = p + xs_countof(xs_unicode_nfd_table);
  241. while (p < e) {
  242. if (p[1] == base && p[2] == diac) {
  243. *cpoint = p[0];
  244. return 1;
  245. }
  246. p += 3;
  247. }
  248. return 0;
  249. }
  250. int xs_unicode_is_alpha(unsigned int cpoint)
  251. /* checks if a codepoint is an alpha (i.e. a letter) */
  252. {
  253. int b = 0;
  254. int t = xs_countof(xs_unicode_alpha_table) / 2 - 1;
  255. while (t >= b) {
  256. int n = (b + t) / 2;
  257. unsigned int *p = &xs_unicode_alpha_table[n * 2];
  258. if (cpoint < p[0])
  259. t = n - 1;
  260. else
  261. if (cpoint > p[1])
  262. b = n + 1;
  263. else
  264. return 1;
  265. }
  266. return 0;
  267. }
  268. int xs_unicode_is_right_to_left(unsigned int cpoint)
  269. /* checks if a codepoint is a right-to-left letter */
  270. {
  271. int b = 0;
  272. int t = xs_countof(xs_unicode_right_to_left_table) / 2 - 1;
  273. while (t >= b) {
  274. int n = (b + t) / 2;
  275. unsigned int *p = &xs_unicode_right_to_left_table[n * 2];
  276. if (cpoint < p[0])
  277. t = n - 1;
  278. else
  279. if (cpoint > p[1])
  280. b = n + 1;
  281. else
  282. return 1;
  283. }
  284. return 0;
  285. }
  286. #ifdef _XS_H
  287. xs_str *xs_utf8_to_upper(const char *str)
  288. {
  289. xs_str *s = xs_str_new(NULL);
  290. unsigned int cpoint;
  291. int offset = 0;
  292. while ((cpoint = xs_utf8_dec(&str))) {
  293. cpoint = xs_unicode_to_upper(cpoint);
  294. s = xs_utf8_insert(s, cpoint, &offset);
  295. }
  296. return s;
  297. }
  298. xs_str *xs_utf8_to_lower(const char *str)
  299. {
  300. xs_str *s = xs_str_new(NULL);
  301. unsigned int cpoint;
  302. int offset = 0;
  303. while ((cpoint = xs_utf8_dec(&str))) {
  304. cpoint = xs_unicode_to_lower(cpoint);
  305. s = xs_utf8_insert(s, cpoint, &offset);
  306. }
  307. return s;
  308. }
  309. xs_str *xs_utf8_to_nfd(const char *str)
  310. {
  311. xs_str *s = xs_str_new(NULL);
  312. unsigned int cpoint;
  313. int offset = 0;
  314. while ((cpoint = xs_utf8_dec(&str))) {
  315. unsigned int base;
  316. unsigned int diac;
  317. if (xs_unicode_nfd(cpoint, &base, &diac)) {
  318. s = xs_utf8_insert(s, base, &offset);
  319. s = xs_utf8_insert(s, diac, &offset);
  320. }
  321. else
  322. s = xs_utf8_insert(s, cpoint, &offset);
  323. }
  324. return s;
  325. }
  326. xs_str *xs_utf8_to_nfc(const char *str)
  327. {
  328. xs_str *s = xs_str_new(NULL);
  329. unsigned int cpoint;
  330. unsigned int base = 0;
  331. int offset = 0;
  332. while ((cpoint = xs_utf8_dec(&str))) {
  333. if (xs_is_diacritic(cpoint)) {
  334. if (xs_unicode_nfc(base, cpoint, &base))
  335. continue;
  336. }
  337. if (base)
  338. s = xs_utf8_insert(s, base, &offset);
  339. base = cpoint;
  340. }
  341. if (base)
  342. s = xs_utf8_insert(s, base, &offset);
  343. return s;
  344. }
  345. #endif /* _XS_H */
  346. #endif /* _XS_UNICODE_TBL_H */
  347. #endif /* XS_IMPLEMENTATION */
  348. #endif /* _XS_UNICODE_H */