xs_unicode.h 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
  4. int xs_utf8_enc(char buf[4], unsigned int cpoint);
  5. int xs_is_utf8_cont_byte(char c);
  6. unsigned int xs_utf8_dec(const char **str);
  7. int xs_unicode_width(unsigned int cpoint);
  8. int xs_is_surrogate(unsigned int cpoint);
  9. int xs_is_diacritic(unsigned int cpoint);
  10. unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
  11. unsigned int xs_surrogate_enc(unsigned int cpoint);
  12. unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
  13. unsigned int *_xs_unicode_lower_search(unsigned int cpoint);
  14. #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
  15. #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
  16. unsigned int xs_unicode_to_upper(unsigned int cpoint);
  17. unsigned int xs_unicode_to_lower(unsigned int cpoint);
  18. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
  19. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
  20. int xs_unicode_is_alpha(unsigned int cpoint);
  21. #ifdef _XS_H
  22. xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset);
  23. xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint);
  24. xs_str *xs_utf8_to_upper(const char *str);
  25. xs_str *xs_utf8_to_lower(const char *str);
  26. xs_str *xs_utf8_to_nfd(const char *str);
  27. xs_str *xs_utf8_to_nfc(const char *str);
  28. #endif
  29. #ifdef XS_IMPLEMENTATION
  30. #ifndef xs_countof
  31. #define xs_countof(a) (sizeof((a)) / sizeof((*a)))
  32. #endif
  33. int xs_utf8_enc(char buf[4], unsigned int cpoint)
  34. /* encodes an Unicode codepoint to utf-8 into buf and returns the size in bytes */
  35. {
  36. char *p = buf;
  37. if (cpoint < 0x80) /* 1 byte char */
  38. *p++ = cpoint & 0xff;
  39. else {
  40. if (cpoint < 0x800) /* 2 byte char */
  41. *p++ = 0xc0 | (cpoint >> 6);
  42. else {
  43. if (cpoint < 0x10000) /* 3 byte char */
  44. *p++ = 0xe0 | (cpoint >> 12);
  45. else { /* 4 byte char */
  46. *p++ = 0xf0 | (cpoint >> 18);
  47. *p++ = 0x80 | ((cpoint >> 12) & 0x3f);
  48. }
  49. *p++ = 0x80 | ((cpoint >> 6) & 0x3f);
  50. }
  51. *p++ = 0x80 | (cpoint & 0x3f);
  52. }
  53. return p - buf;
  54. }
  55. int xs_is_utf8_cont_byte(char c)
  56. /* returns true if c is an utf8 continuation byte */
  57. {
  58. return ((c & 0xc0) == 0x80);
  59. }
  60. unsigned int xs_utf8_dec(const char **str)
  61. /* decodes an utf-8 char inside str and updates the pointer */
  62. {
  63. const char *p = *str;
  64. unsigned int cpoint = 0;
  65. unsigned char c = *p++;
  66. int cb = 0;
  67. if ((c & 0x80) == 0) { /* 1 byte char */
  68. cpoint = c;
  69. }
  70. else
  71. if ((c & 0xe0) == 0xc0) { /* 2 byte char */
  72. cpoint = (c & 0x1f) << 6;
  73. cb = 1;
  74. }
  75. else
  76. if ((c & 0xf0) == 0xe0) { /* 3 byte char */
  77. cpoint = (c & 0x0f) << 12;
  78. cb = 2;
  79. }
  80. else
  81. if ((c & 0xf8) == 0xf0) { /* 4 byte char */
  82. cpoint = (c & 0x07) << 18;
  83. cb = 3;
  84. }
  85. /* process the continuation bytes */
  86. while (cb > 0 && *p && xs_is_utf8_cont_byte(*p))
  87. cpoint |= (*p++ & 0x3f) << (--cb * 6);
  88. /* incomplete or broken? */
  89. if (cb)
  90. cpoint = 0xfffd;
  91. *str = p;
  92. return cpoint;
  93. }
  94. /** Unicode character width: intentionally dead simple **/
  95. static unsigned int xs_unicode_width_table[] = {
  96. 0x300, 0x36f, 0, /* diacritics */
  97. 0x1100, 0x11ff, 2, /* Hangul */
  98. 0x2e80, 0xa4cf, 2, /* CJK */
  99. 0xac00, 0xd7a3, 2, /* more Hangul */
  100. 0xe000, 0xf8ff, 0, /* private use */
  101. 0xf900, 0xfaff, 2, /* CJK compatibility */
  102. 0xff00, 0xff60, 2, /* full width things */
  103. 0xffdf, 0xffe6, 2, /* full width things */
  104. 0x1f200, 0x1ffff, 2, /* emojis */
  105. 0x20000, 0x2fffd, 2 /* more CJK */
  106. };
  107. int xs_unicode_width(unsigned int cpoint)
  108. /* returns the width in columns of a Unicode codepoint (somewhat simplified) */
  109. {
  110. int b = 0;
  111. int t = xs_countof(xs_unicode_width_table) / 3 - 1;
  112. while (t >= b) {
  113. int n = (b + t) / 2;
  114. unsigned int *p = &xs_unicode_width_table[n * 3];
  115. if (cpoint < p[0])
  116. t = n - 1;
  117. else
  118. if (cpoint > p[1])
  119. b = n + 1;
  120. else
  121. return p[2];
  122. }
  123. return 1;
  124. }
  125. int xs_is_diacritic(unsigned int cpoint)
  126. {
  127. return cpoint >= 0x300 && cpoint <= 0x36f;
  128. }
  129. /** surrogate pairs **/
  130. int xs_is_surrogate(unsigned int cpoint)
  131. /* checks if cpoint is the first element of a Unicode surrogate pair */
  132. {
  133. return cpoint >= 0xd800 && cpoint <= 0xdfff;
  134. }
  135. unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
  136. /* "decodes" a surrogate pair into a codepoint */
  137. {
  138. return 0x10000 | ((p1 & 0x3ff) << 10) | (p2 & 0x3ff);
  139. }
  140. unsigned int xs_surrogate_enc(unsigned int cpoint)
  141. /* "encodes" a Unicode into a surrogate pair (p1 in the MSB word) */
  142. {
  143. unsigned int p1 = 0xd7c0 + (cpoint >> 10);
  144. unsigned int p2 = 0xdc00 + (cpoint & 0x3ff);
  145. return (p1 << 16) | p2;
  146. }
  147. #ifdef _XS_H
  148. xs_str *xs_utf8_insert(xs_str *str, unsigned int cpoint, int *offset)
  149. /* encodes an Unicode codepoint to utf-8 into str */
  150. {
  151. char tmp[4];
  152. int c = xs_utf8_enc(tmp, cpoint);
  153. str = xs_insert_m(str, *offset, tmp, c);
  154. *offset += c;
  155. return str;
  156. }
  157. xs_str *xs_utf8_cat(xs_str *str, unsigned int cpoint)
  158. /* encodes an Unicode codepoint to utf-8 into str */
  159. {
  160. int offset = strlen(str);
  161. return xs_utf8_insert(str, cpoint, &offset);
  162. }
  163. #endif /* _XS_H */
  164. #ifdef _XS_UNICODE_TBL_H
  165. /* include xs_unicode_tbl.h before this one to use these functions */
  166. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  167. /* searches for an uppercase codepoint in the case fold table */
  168. {
  169. int b = 0;
  170. int t = xs_countof(xs_unicode_case_fold_table) / 2 + 1;
  171. while (t >= b) {
  172. int n = (b + t) / 2;
  173. unsigned int *p = &xs_unicode_case_fold_table[n * 2];
  174. if (cpoint < p[0])
  175. t = n - 1;
  176. else
  177. if (cpoint > p[0])
  178. b = n + 1;
  179. else
  180. return p;
  181. }
  182. return NULL;
  183. }
  184. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  185. /* searches for a lowercase codepoint in the case fold table */
  186. {
  187. unsigned int *p = xs_unicode_case_fold_table;
  188. unsigned int *e = p + xs_countof(xs_unicode_case_fold_table);
  189. while (p < e) {
  190. if (cpoint == p[1])
  191. return p;
  192. p += 2;
  193. }
  194. return NULL;
  195. }
  196. unsigned int xs_unicode_to_lower(unsigned int cpoint)
  197. /* returns the cpoint to lowercase */
  198. {
  199. if (cpoint < 0x80)
  200. return tolower(cpoint);
  201. unsigned int *p = _xs_unicode_upper_search(cpoint);
  202. return p == NULL ? cpoint : p[1];
  203. }
  204. unsigned int xs_unicode_to_upper(unsigned int cpoint)
  205. /* returns the cpoint to uppercase */
  206. {
  207. if (cpoint < 0x80)
  208. return toupper(cpoint);
  209. unsigned int *p = _xs_unicode_lower_search(cpoint);
  210. return p == NULL ? cpoint : p[0];
  211. }
  212. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  213. /* applies unicode Normalization Form D */
  214. {
  215. int b = 0;
  216. int t = xs_countof(xs_unicode_nfd_table) / 3 - 1;
  217. while (t >= b) {
  218. int n = (b + t) / 2;
  219. unsigned int *p = &xs_unicode_nfd_table[n * 3];
  220. int c = cpoint - p[0];
  221. if (c < 0)
  222. t = n - 1;
  223. else
  224. if (c > 0)
  225. b = n + 1;
  226. else {
  227. *base = p[1];
  228. *diac = p[2];
  229. return 1;
  230. }
  231. }
  232. return 0;
  233. }
  234. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  235. /* applies unicode Normalization Form C */
  236. {
  237. unsigned int *p = xs_unicode_nfd_table;
  238. unsigned int *e = p + xs_countof(xs_unicode_nfd_table);
  239. while (p < e) {
  240. if (p[1] == base && p[2] == diac) {
  241. *cpoint = p[0];
  242. return 1;
  243. }
  244. p += 3;
  245. }
  246. return 0;
  247. }
  248. int xs_unicode_is_alpha(unsigned int cpoint)
  249. /* checks if a codepoint is an alpha (i.e. a letter) */
  250. {
  251. int b = 0;
  252. int t = xs_countof(xs_unicode_alpha_table) / 2 - 1;
  253. while (t >= b) {
  254. int n = (b + t) / 2;
  255. unsigned int *p = &xs_unicode_alpha_table[n * 2];
  256. if (cpoint < p[0])
  257. t = n - 1;
  258. else
  259. if (cpoint > p[1])
  260. b = n + 1;
  261. else
  262. return 1;
  263. }
  264. return 0;
  265. }
  266. #ifdef _XS_H
  267. xs_str *xs_utf8_to_upper(const char *str)
  268. {
  269. xs_str *s = xs_str_new(NULL);
  270. unsigned int cpoint;
  271. int offset = 0;
  272. while ((cpoint = xs_utf8_dec(&str))) {
  273. cpoint = xs_unicode_to_upper(cpoint);
  274. s = xs_utf8_insert(s, cpoint, &offset);
  275. }
  276. return s;
  277. }
  278. xs_str *xs_utf8_to_lower(const char *str)
  279. {
  280. xs_str *s = xs_str_new(NULL);
  281. unsigned int cpoint;
  282. int offset = 0;
  283. while ((cpoint = xs_utf8_dec(&str))) {
  284. cpoint = xs_unicode_to_lower(cpoint);
  285. s = xs_utf8_insert(s, cpoint, &offset);
  286. }
  287. return s;
  288. }
  289. xs_str *xs_utf8_to_nfd(const char *str)
  290. {
  291. xs_str *s = xs_str_new(NULL);
  292. unsigned int cpoint;
  293. int offset = 0;
  294. while ((cpoint = xs_utf8_dec(&str))) {
  295. unsigned int base;
  296. unsigned int diac;
  297. if (xs_unicode_nfd(cpoint, &base, &diac)) {
  298. s = xs_utf8_insert(s, base, &offset);
  299. s = xs_utf8_insert(s, diac, &offset);
  300. }
  301. else
  302. s = xs_utf8_insert(s, cpoint, &offset);
  303. }
  304. return s;
  305. }
  306. xs_str *xs_utf8_to_nfc(const char *str)
  307. {
  308. xs_str *s = xs_str_new(NULL);
  309. unsigned int cpoint;
  310. unsigned int base = 0;
  311. int offset = 0;
  312. while ((cpoint = xs_utf8_dec(&str))) {
  313. if (xs_is_diacritic(cpoint)) {
  314. if (xs_unicode_nfc(base, cpoint, &base))
  315. continue;
  316. }
  317. if (base)
  318. s = xs_utf8_insert(s, base, &offset);
  319. base = cpoint;
  320. }
  321. if (base)
  322. s = xs_utf8_insert(s, base, &offset);
  323. return s;
  324. }
  325. #endif /* _XS_H */
  326. #endif /* _XS_UNICODE_TBL_H */
  327. #endif /* XS_IMPLEMENTATION */
  328. #endif /* _XS_UNICODE_H */