xs_unicode.h 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. /* copyright (c) 2022 - 2024 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
  /** UTF-8 encoding and decoding **/

  /* writes the UTF-8 form of cpoint into buf; returns the number of bytes written */
  int _xs_utf8_enc(char buf[4], unsigned int cpoint);

  /* non-zero if c is a UTF-8 continuation byte (bit pattern 10xxxxxx) */
  int xs_is_utf8_cont_byte(char c);

  /* decodes one UTF-8 sequence starting at *str and advances *str past it */
  unsigned int xs_utf8_dec(const char **str);

  /** display width **/

  /* approximate number of terminal columns used by cpoint (0, 1 or 2) */
  int xs_unicode_width(unsigned int cpoint);

  /** UTF-16 surrogate pairs **/

  /* non-zero if cpoint lies in the surrogate range U+D800 .. U+DFFF */
  int xs_is_surrogate(unsigned int cpoint);

  /* combines the two halves of a surrogate pair into one codepoint */
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);

  /* splits cpoint into a surrogate pair, returned as (p1 << 16) | p2 */
  unsigned int xs_surrogate_enc(unsigned int cpoint);

  /** case folding, normalization and classification
      (implemented only if xs_unicode_tbl.h was included first) **/

  /* return a pointer to the matching case fold table entry, or NULL */
  unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
  unsigned int *_xs_unicode_lower_search(unsigned int cpoint);

  /* 1 if cpoint has an entry as uppercase / lowercase, 0 otherwise */
  #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
  #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))

  /* case conversion; cpoint is returned unchanged if it has no mapping */
  unsigned int xs_unicode_to_upper(unsigned int cpoint);
  unsigned int xs_unicode_to_lower(unsigned int cpoint);

  /* splits cpoint into base + diacritic; returns 1 if found, 0 otherwise */
  int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);

  /* joins base + diacritic into one codepoint; returns 1 if found, 0 otherwise */
  int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);

  /* non-zero if cpoint is a letter */
  int xs_unicode_is_alpha(unsigned int cpoint);

  #ifdef _XS_H
  /* appends the UTF-8 form of cpoint to an xs string */
  xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
  #endif
  23. #ifdef XS_IMPLEMENTATION
  #ifndef xs_countof
  /* number of elements of a true array (NOT a pointer); the argument is
     parenthesized at every use so any array-valued expression is safe */
  #define xs_countof(a) (sizeof(a) / sizeof((a)[0]))
  #endif
  int _xs_utf8_enc(char buf[4], unsigned int cpoint)
  /* encodes an Unicode codepoint to utf-8 into buf and returns the size
     in bytes (1 to 4). buf is NOT zero-terminated. Values above U+10FFFF
     cannot be expressed in utf-8, so they are encoded as U+FFFD
     (replacement character) instead of emitting a malformed lead byte */
  {
      char *p = buf;

      /* out of the Unicode range: substitute the replacement character */
      if (cpoint > 0x10ffff)
          cpoint = 0xfffd;

      if (cpoint < 0x80) {
          /* 1 byte char */
          *p++ = (char) cpoint;
      }
      else
      if (cpoint < 0x800) {
          /* 2 byte char */
          *p++ = (char) (0xc0 | (cpoint >> 6));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }
      else
      if (cpoint < 0x10000) {
          /* 3 byte char */
          *p++ = (char) (0xe0 | (cpoint >> 12));
          *p++ = (char) (0x80 | ((cpoint >> 6) & 0x3f));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }
      else {
          /* 4 byte char */
          *p++ = (char) (0xf0 | (cpoint >> 18));
          *p++ = (char) (0x80 | ((cpoint >> 12) & 0x3f));
          *p++ = (char) (0x80 | ((cpoint >> 6) & 0x3f));
          *p++ = (char) (0x80 | (cpoint & 0x3f));
      }

      return (int) (p - buf);
  }
  int xs_is_utf8_cont_byte(char c)
  /* tells whether c has the 10xxxxxx bit pattern of an utf-8 continuation byte */
  {
      const unsigned char byte = (unsigned char) c;

      /* 10xxxxxx covers exactly the range 0x80 .. 0xbf */
      if (byte < 0x80 || byte > 0xbf)
          return 0;

      return 1;
  }
  unsigned int xs_utf8_dec(const char **str)
  /* decodes an utf-8 char inside str and updates the pointer.
     Returns the codepoint, or U+FFFD (replacement character) if the
     lead byte is invalid or the sequence is cut short; in the latter
     case the pointer is left on the first byte that did not fit.
     The first byte is always consumed, even if it's the final zero */
  {
      const char *p = *str;
      unsigned int cpoint = 0;
      unsigned char c = (unsigned char) *p++;
      int cb = 0; /* continuation bytes still expected */

      if ((c & 0x80) == 0) { /* 1 byte char */
          cpoint = c;
      }
      else
      if ((c & 0xe0) == 0xc0) { /* 2 byte char */
          cpoint = (c & 0x1f) << 6;
          cb = 1;
      }
      else
      if ((c & 0xf0) == 0xe0) { /* 3 byte char */
          cpoint = (c & 0x0f) << 12;
          cb = 2;
      }
      else
      if ((c & 0xf8) == 0xf0) { /* 4 byte char */
          cpoint = (c & 0x07) << 18;
          cb = 3;
      }
      else {
          /* stray continuation byte (0x80-0xbf) or invalid lead byte
             (0xf8-0xff): never report it as a 0, because callers can't
             tell that from the end of the string */
          cpoint = 0xfffd;
      }

      /* process the continuation bytes (same 10xxxxxx test as
         xs_is_utf8_cont_byte(); a zero terminator fails it as well) */
      while (cb > 0 && ((unsigned char) *p & 0xc0) == 0x80) {
          unsigned int bits = (unsigned char) *p++ & 0x3f;

          cb--;
          cpoint |= bits << (cb * 6);
      }

      /* incomplete or broken? */
      if (cb)
          cpoint = 0xfffd;

      *str = p;

      return cpoint;
  }
  /** Unicode character width: intentionally dead simple **/

  /* flat list of (first, last, columns) triples, sorted by codepoint */
  static unsigned int xs_unicode_width_table[] = {
      0x300,   0x36f,   0,  /* diacritics */
      0x1100,  0x11ff,  2,  /* Hangul */
      0x2e80,  0xa4cf,  2,  /* CJK */
      0xac00,  0xd7a3,  2,  /* more Hangul */
      0xe000,  0xf8ff,  0,  /* private use */
      0xf900,  0xfaff,  2,  /* CJK compatibility */
      0xff00,  0xff60,  2,  /* full width things */
      0xffdf,  0xffe6,  2,  /* full width things */
      0x1f200, 0x1ffff, 2,  /* emojis */
      0x20000, 0x2fffd, 2   /* more CJK */
  };


  int xs_unicode_width(unsigned int cpoint)
  /* gives the number of columns a codepoint takes on screen: the value
     stored in the table for the range that contains it, or 1 otherwise */
  {
      const int n_ranges = (int) (sizeof(xs_unicode_width_table) /
                                  sizeof(xs_unicode_width_table[0]) / 3);
      int lo = 0;
      int hi = n_ranges - 1;

      /* binary search over the closed interval [lo, hi] of triples */
      while (lo <= hi) {
          const int mid = lo + (hi - lo) / 2;
          const unsigned int *range = xs_unicode_width_table + mid * 3;

          if (cpoint < range[0]) {
              hi = mid - 1;
              continue;
          }

          if (cpoint > range[1]) {
              lo = mid + 1;
              continue;
          }

          /* inside this range */
          return (int) range[2];
      }

      /* not listed: a regular, single column character */
      return 1;
  }
  /** surrogate pairs **/

  int xs_is_surrogate(unsigned int cpoint)
  /* tells whether cpoint falls in the surrogate range U+D800 .. U+DFFF
     (this covers both the leading and the trailing halves of a pair) */
  {
      /* every value in D800 .. DFFF shares the same bits above the low 11 */
      return (cpoint & ~0x7ffu) == 0xd800u;
  }
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
  /* "decodes" a surrogate pair into a codepoint: p1 is the leading
     (high) half and p2 the trailing (low) one. Only the low 10 bits
     of each half are used */
  {
      /* the 20 bits carried by the pair are an offset from U+10000 */
      unsigned int offset = ((p1 & 0x3ff) << 10) | (p2 & 0x3ff);

      /* this must be an addition: an OR would lose the carry whenever
         bit 16 of the offset is set (e.g. D840 DC00 is U+20000) */
      return 0x10000 + offset;
  }
  unsigned int xs_surrogate_enc(unsigned int cpoint)
  /* splits a codepoint into a surrogate pair, packed in a single value:
     leading half in the upper 16 bits, trailing half in the lower 16 */
  {
      /* 0xd7c0 is 0xd800 - (0x10000 >> 10), so the U+10000 bias
         is already discounted from the leading half */
      const unsigned int lead  = (cpoint >> 10) + 0xd7c0;
      const unsigned int trail = (cpoint & 0x3ff) + 0xdc00;

      return (lead << 16) | trail;
  }
  #ifdef _XS_H

  xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
  /* converts cpoint to utf-8 and appends those bytes to str,
     returning whatever xs_append_m() returns */
  {
      char utf8[4];
      const int size = _xs_utf8_enc(utf8, cpoint);

      return xs_append_m(str, utf8, size);
  }

  #endif /* _XS_H */
  146. #ifdef _XS_UNICODE_TBL_H
  147. /* include xs_unicode_tbl.h before this one to use these functions */
  148. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  149. /* searches for an uppercase codepoint in the case fold table */
  150. {
  151. int b = 0;
  152. int t = xs_countof(xs_unicode_case_fold_table) / 2 + 1;
  153. while (t >= b) {
  154. int n = (b + t) / 2;
  155. unsigned int *p = &xs_unicode_case_fold_table[n * 2];
  156. if (cpoint < p[0])
  157. t = n - 1;
  158. else
  159. if (cpoint > p[0])
  160. b = n + 1;
  161. else
  162. return p;
  163. }
  164. return NULL;
  165. }
  166. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  167. /* searches for a lowercase codepoint in the case fold table */
  168. {
  169. unsigned int *p = xs_unicode_case_fold_table;
  170. unsigned int *e = p + xs_countof(xs_unicode_case_fold_table);
  171. while (p < e) {
  172. if (cpoint == p[1])
  173. return p;
  174. p += 2;
  175. }
  176. return NULL;
  177. }
  unsigned int xs_unicode_to_lower(unsigned int cpoint)
  /* converts cpoint to lowercase; codepoints without an entry as
     uppercase in the case fold table are returned as they are */
  {
      unsigned int *pair = _xs_unicode_upper_search(cpoint);

      if (pair != NULL)
          return pair[1];

      return cpoint;
  }
  unsigned int xs_unicode_to_upper(unsigned int cpoint)
  /* converts cpoint to uppercase; codepoints without an entry as
     lowercase in the case fold table are returned as they are */
  {
      unsigned int *pair = _xs_unicode_lower_search(cpoint);

      if (pair != NULL)
          return pair[0];

      return cpoint;
  }
  190. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  191. /* applies unicode Normalization Form D */
  192. {
  193. int b = 0;
  194. int t = xs_countof(xs_unicode_nfd_table) / 3 - 1;
  195. while (t >= b) {
  196. int n = (b + t) / 2;
  197. unsigned int *p = &xs_unicode_nfd_table[n * 3];
  198. int c = cpoint - p[0];
  199. if (c < 0)
  200. t = n - 1;
  201. else
  202. if (c > 0)
  203. b = n + 1;
  204. else {
  205. *base = p[1];
  206. *diac = p[2];
  207. return 1;
  208. }
  209. }
  210. return 0;
  211. }
  212. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  213. /* applies unicode Normalization Form C */
  214. {
  215. unsigned int *p = xs_unicode_nfd_table;
  216. unsigned int *e = p + xs_countof(xs_unicode_nfd_table);
  217. while (p < e) {
  218. if (p[1] == base && p[2] == diac) {
  219. *cpoint = p[0];
  220. return 1;
  221. }
  222. p += 3;
  223. }
  224. return 0;
  225. }
  226. int xs_unicode_is_alpha(unsigned int cpoint)
  227. /* checks if a codepoint is an alpha (i.e. a letter) */
  228. {
  229. int b = 0;
  230. int t = xs_countof(xs_unicode_alpha_table) / 2 - 1;
  231. while (t >= b) {
  232. int n = (b + t) / 2;
  233. unsigned int *p = &xs_unicode_alpha_table[n * 2];
  234. if (cpoint < p[0])
  235. t = n - 1;
  236. else
  237. if (cpoint > p[1])
  238. b = n + 1;
  239. else
  240. return 1;
  241. }
  242. return 0;
  243. }
  244. #endif /* _XS_UNICODE_TBL_H */
  245. #endif /* XS_IMPLEMENTATION */
  246. #endif /* _XS_UNICODE_H */