xs_unicode.h 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316
  1. /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
  /** UTF-8 encoding and decoding **/

  /* writes the utf-8 encoding of cpoint into buf, returns the number of bytes */
  int _xs_utf8_enc(char buf[4], unsigned int cpoint);

  /* decodes one codepoint starting at *str and moves *str past it */
  unsigned int xs_utf8_dec(char **str);

  #ifdef _XS_H
  /* xs_str flavour of the encoder; only declared when _XS_H is defined */
  xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
  #endif

  /** display width **/

  /* number of columns used by cpoint (0, 1 or 2) */
  int xs_unicode_width(unsigned int cpoint);

  /** UTF-16 surrogates **/

  /* non-zero if cpoint is in the 0xd800 - 0xdfff range */
  int xs_is_surrogate(unsigned int cpoint);

  /* combines the two halves of a surrogate pair into a codepoint */
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);

  /* splits a codepoint into a surrogate pair, returned as (p1 << 16) | p2 */
  unsigned int xs_surrogate_enc(unsigned int cpoint);

  /** case folding **/

  /* both searches return a pointer to an { upper, lower } pair of the
     case fold table, or NULL if cpoint is not found */
  unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
  unsigned int *_xs_unicode_lower_search(unsigned int cpoint);

  #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
  #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))

  /* case conversion; cpoint is returned unchanged if it has no mapping */
  unsigned int xs_unicode_to_upper(unsigned int cpoint);
  unsigned int xs_unicode_to_lower(unsigned int cpoint);

  /** normalization **/

  /* splits cpoint into base + diacritic; returns 1 on success, 0 otherwise */
  int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);

  /* joins base + diacritic into cpoint; returns 1 on success, 0 otherwise */
  int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);

  /** classification **/

  /* non-zero if cpoint falls inside one of the ranges of the alpha table */
  int xs_unicode_is_alpha(unsigned int cpoint);
  22. #ifdef XS_IMPLEMENTATION
  #ifndef countof
  /* number of elements of a real array (do not use on pointers) */
  #define countof(a) (sizeof(a) / sizeof((a)[0]))
  #endif
  int _xs_utf8_enc(char buf[4], unsigned int cpoint)
  /* stores the utf-8 byte sequence for cpoint in buf (no NUL is added)
     and returns its length, from 1 to 4. The value is not validated:
     anything from 0x10000 up is written as a 4 byte sequence */
  {
      int len;

      if (cpoint < 0x80) {
          /* plain ASCII */
          buf[0] = cpoint & 0xff;
          len = 1;
      }
      else if (cpoint < 0x800) {
          /* 110xxxxx 10xxxxxx */
          buf[0] = 0xc0 | (cpoint >> 6);
          buf[1] = 0x80 | (cpoint & 0x3f);
          len = 2;
      }
      else if (cpoint < 0x10000) {
          /* 1110xxxx 10xxxxxx 10xxxxxx */
          buf[0] = 0xe0 | (cpoint >> 12);
          buf[1] = 0x80 | ((cpoint >> 6) & 0x3f);
          buf[2] = 0x80 | (cpoint & 0x3f);
          len = 3;
      }
      else {
          /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
          buf[0] = 0xf0 | (cpoint >> 18);
          buf[1] = 0x80 | ((cpoint >> 12) & 0x3f);
          buf[2] = 0x80 | ((cpoint >> 6) & 0x3f);
          buf[3] = 0x80 | (cpoint & 0x3f);
          len = 4;
      }

      return len;
  }
  unsigned int xs_utf8_dec(char **str)
  /* decodes the utf-8 sequence that starts at *str, stores in *str the
     address of the first byte not consumed and returns the codepoint.
     Returns 0xfffd (REPLACEMENT CHARACTER) if the lead byte is not valid
     or if the sequence is cut short; overlong forms are not rejected.
     The byte at *str is always consumed, even if it's the NUL terminator */
  {
      char *p = *str;
      unsigned int cpoint = 0;
      unsigned char c = *p++;
      int cb = 0; /* continuation bytes still expected */

      if ((c & 0x80) == 0) { /* 1 byte char */
          cpoint = c;
      }
      else
      if ((c & 0xe0) == 0xc0) { /* 2 byte char */
          cpoint = (c & 0x1f) << 6;
          cb = 1;
      }
      else
      if ((c & 0xf0) == 0xe0) { /* 3 byte char */
          cpoint = (c & 0x0f) << 12;
          cb = 2;
      }
      else
      if ((c & 0xf8) == 0xf0) { /* 4 byte char */
          cpoint = (c & 0x07) << 18;
          cb = 3;
      }
      else {
          /* stray continuation byte (0x80-0xbf) or impossible lead byte
             (0xf8-0xff): report it as broken instead of returning 0,
             which callers cannot tell apart from the end of the string */
          *str = p;
          return 0xfffd;
      }

      /* process the continuation bytes; a NUL never matches the 10xxxxxx
         pattern, so the loop also stops at the end of the string */
      while (cb > 0 && ((unsigned char)*p & 0xc0) == 0x80) {
          cb--;
          cpoint |= (unsigned int)(*p & 0x3f) << (cb * 6);
          p++;
      }

      /* incomplete or broken? */
      if (cb)
          cpoint = 0xfffd;

      *str = p;

      return cpoint;
  }
  /** Unicode character width: intentionally dead simple **/

  /* triplets of { first codepoint, last codepoint, width in columns },
     sorted in ascending order so they can be binary searched */
  static unsigned int xs_unicode_width_table[] = {
      0x300,      0x36f,      0,      /* diacritics */
      0x1100,     0x11ff,     2,      /* Hangul */
      0x2e80,     0xa4cf,     2,      /* CJK */
      0xac00,     0xd7a3,     2,      /* more Hangul */
      0xe000,     0xf8ff,     0,      /* private use */
      0xf900,     0xfaff,     2,      /* CJK compatibility */
      0xff00,     0xff60,     2,      /* full width things */
      0xffdf,     0xffe6,     2,      /* full width things */
      0x1f200,    0x1ffff,    2,      /* emojis */
      0x20000,    0x2fffd,    2       /* more CJK */
  };

  int xs_unicode_width(unsigned int cpoint)
  /* looks cpoint up in the width table and returns the width stored for
     its range; codepoints outside every range are 1 column wide */
  {
      int lo = 0;
      int hi = sizeof(xs_unicode_width_table) /
               sizeof(xs_unicode_width_table[0]) / 3 - 1;

      while (lo <= hi) {
          int mid = (lo + hi) / 2;
          unsigned int *row = xs_unicode_width_table + mid * 3;

          if (cpoint > row[1]) {
              /* past the end of this range: look in the upper half */
              lo = mid + 1;
          }
          else if (cpoint < row[0]) {
              /* before the start of this range: look in the lower half */
              hi = mid - 1;
          }
          else {
              return row[2];
          }
      }

      return 1;
  }
  113. /** surrogate pairs **/
  int xs_is_surrogate(unsigned int cpoint)
  /* returns 1 if cpoint lies in the 0xd800 - 0xdfff range (both halves
     of a surrogate pair are accepted), 0 otherwise */
  {
      if (cpoint < 0xd800)
          return 0;

      if (cpoint > 0xdfff)
          return 0;

      return 1;
  }
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
  /* "decodes" a surrogate pair (p1: high half, p2: low half) into a
     codepoint. Only the 10 low bits of each half are used */
  {
      unsigned int payload = ((p1 & 0x3ff) << 10) | (p2 & 0x3ff);

      /* the 0x10000 offset must be added, not or'ed: the payload takes
         20 bits, so bit 16 may already be set in it (this happens for
         every codepoint from 0x20000 up) and the carry must propagate */
      return 0x10000 + payload;
  }
  unsigned int xs_surrogate_enc(unsigned int cpoint)
  /* "encodes" a codepoint into a surrogate pair, packed in one value:
     the high half goes into bits 16-31 and the low half into bits 0-15 */
  {
      /* 0xd7c0 is 0xd800 - (0x10000 >> 10), i.e. the 0x10000 offset is
         already discounted from the base of the high half */
      unsigned int hi = (cpoint >> 10) + 0xd7c0;
      unsigned int lo = (cpoint & 0x3ff) + 0xdc00;

      return lo | (hi << 16);
  }
  #ifdef _XS_H

  xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
  /* appends the utf-8 encoding of cpoint to str and returns whatever
     xs_append_m() returns */
  {
      char utf8[4];
      int size = _xs_utf8_enc(utf8, cpoint);

      return xs_append_m(str, utf8, size);
  }

  #endif /* _XS_H */
  140. #ifdef _XS_UNICODE_TBL_H
  141. /* include xs_unicode_tbl.h before this one to use these functions */
  142. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  143. /* searches for an uppercase codepoint in the case fold table */
  144. {
  145. int b = 0;
  146. int t = countof(xs_unicode_case_fold_table) / 2 + 1;
  147. while (t >= b) {
  148. int n = (b + t) / 2;
  149. unsigned int *p = &xs_unicode_case_fold_table[n * 2];
  150. if (cpoint < p[0])
  151. t = n - 1;
  152. else
  153. if (cpoint > p[0])
  154. b = n + 1;
  155. else
  156. return p;
  157. }
  158. return NULL;
  159. }
  160. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  161. /* searches for a lowercase codepoint in the case fold table */
  162. {
  163. unsigned int *p = xs_unicode_case_fold_table;
  164. unsigned int *e = p + countof(xs_unicode_case_fold_table);
  165. while (p < e) {
  166. if (cpoint == p[1])
  167. return p;
  168. p += 2;
  169. }
  170. return NULL;
  171. }
  unsigned int xs_unicode_to_lower(unsigned int cpoint)
  /* returns the second member of the case fold pair that has cpoint as
     its first member, or cpoint itself if there is no such pair */
  {
      unsigned int *pair = _xs_unicode_upper_search(cpoint);

      if (pair != NULL)
          return pair[1];

      return cpoint;
  }
  unsigned int xs_unicode_to_upper(unsigned int cpoint)
  /* returns the first member of the case fold pair that has cpoint as
     its second member, or cpoint itself if there is no such pair */
  {
      unsigned int *pair = _xs_unicode_lower_search(cpoint);

      if (pair != NULL)
          return pair[0];

      return cpoint;
  }
  184. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  185. /* applies unicode Normalization Form D */
  186. {
  187. int b = 0;
  188. int t = countof(xs_unicode_nfd_table) / 3 - 1;
  189. while (t >= b) {
  190. int n = (b + t) / 2;
  191. unsigned int *p = &xs_unicode_nfd_table[n * 3];
  192. int c = cpoint - p[0];
  193. if (c < 0)
  194. t = n - 1;
  195. else
  196. if (c > 0)
  197. b = n + 1;
  198. else {
  199. *base = p[1];
  200. *diac = p[2];
  201. return 1;
  202. }
  203. }
  204. return 0;
  205. }
  206. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  207. /* applies unicode Normalization Form C */
  208. {
  209. unsigned int *p = xs_unicode_nfd_table;
  210. unsigned int *e = p + countof(xs_unicode_nfd_table);
  211. while (p < e) {
  212. if (p[1] == base && p[2] == diac) {
  213. *cpoint = p[0];
  214. return 1;
  215. }
  216. p += 3;
  217. }
  218. return 0;
  219. }
  220. int xs_unicode_is_alpha(unsigned int cpoint)
  221. /* checks if a codepoint is an alpha (i.e. a letter) */
  222. {
  223. int b = 0;
  224. int t = countof(xs_unicode_alpha_table) / 2 - 1;
  225. while (t >= b) {
  226. int n = (b + t) / 2;
  227. unsigned int *p = &xs_unicode_alpha_table[n * 2];
  228. if (cpoint < p[0])
  229. t = n - 1;
  230. else
  231. if (cpoint > p[1])
  232. b = n + 1;
  233. else
  234. return 1;
  235. }
  236. return 0;
  237. }
  238. #endif /* _XS_UNICODE_TBL_H */
  239. #endif /* XS_IMPLEMENTATION */
  240. #endif /* _XS_UNICODE_H */