xs_unicode.h 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. /* copyright (c) 2022 - 2023 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
  4. int _xs_utf8_enc(char buf[4], unsigned int cpoint);
  5. xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
  6. unsigned int xs_utf8_dec(char **str);
  7. int xs_unicode_width(unsigned int cpoint);
  8. int xs_is_surrogate(unsigned int cpoint);
  9. unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2);
  10. unsigned int xs_surrogate_enc(unsigned int cpoint);
  11. unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
  12. unsigned int *_xs_unicode_lower_search(unsigned int cpoint);
  13. #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
  14. #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
  15. unsigned int xs_unicode_to_upper(unsigned int cpoint);
  16. unsigned int xs_unicode_to_lower(unsigned int cpoint);
  17. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
  18. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
  19. int xs_unicode_is_alpha(unsigned int cpoint);
  20. #ifdef XS_IMPLEMENTATION
  int _xs_utf8_enc(char buf[4], unsigned int cpoint)
  /* encodes an Unicode codepoint to utf-8 into buf and returns the size
     in bytes (1 to 4). buf is not NUL-terminated. Values above 0x10ffff
     are not Unicode codepoints and cannot be represented in utf-8, so
     they are encoded as U+FFFD (the replacement character) instead */
  {
      unsigned char *out = (unsigned char *)buf;
      int n = 0;

      /* out of range: without this, values from 0x200000 up would
         overflow the 3 payload bits of the 4-byte lead and yield a
         bogus first byte */
      if (cpoint > 0x10ffff)
          cpoint = 0xfffd;

      if (cpoint < 0x80) {
          /* 1 byte char: 0xxxxxxx */
          out[n++] = (unsigned char)cpoint;
      }
      else
      if (cpoint < 0x800) {
          /* 2 byte char: 110xxxxx 10xxxxxx */
          out[n++] = (unsigned char)(0xc0 | (cpoint >> 6));
          out[n++] = (unsigned char)(0x80 | (cpoint & 0x3f));
      }
      else
      if (cpoint < 0x10000) {
          /* 3 byte char: 1110xxxx 10xxxxxx 10xxxxxx */
          out[n++] = (unsigned char)(0xe0 | (cpoint >> 12));
          out[n++] = (unsigned char)(0x80 | ((cpoint >> 6) & 0x3f));
          out[n++] = (unsigned char)(0x80 | (cpoint & 0x3f));
      }
      else {
          /* 4 byte char: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
          out[n++] = (unsigned char)(0xf0 | (cpoint >> 18));
          out[n++] = (unsigned char)(0x80 | ((cpoint >> 12) & 0x3f));
          out[n++] = (unsigned char)(0x80 | ((cpoint >> 6) & 0x3f));
          out[n++] = (unsigned char)(0x80 | (cpoint & 0x3f));
      }

      return n;
  }
  43. xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
  44. /* encodes an Unicode codepoint to utf-8 into str */
  45. {
  46. char tmp[4];
  47. int c = _xs_utf8_enc(tmp, cpoint);
  48. return xs_append_m(str, tmp, c);
  49. }
  unsigned int xs_utf8_dec(char **str)
  /* decodes an utf-8 char inside str and updates the pointer.
     Returns the decoded codepoint, or 0xfffd (the replacement
     character) if the sequence is broken: an invalid first byte or
     a missing continuation byte. The first byte is always consumed,
     even if it's the NUL terminator (0 is returned in that case, so
     callers must stop there); a byte that should have been a
     continuation but isn't is left unread for the next call */
  {
      unsigned char *p = (unsigned char *)*str;
      unsigned int cpoint = 0;
      int c = *p++;
      int cb = 0;     /* continuation bytes still expected */

      if ((c & 0x80) == 0) { /* 1 byte char */
          cpoint = c;
      }
      else
      if ((c & 0xe0) == 0xc0) { /* 2 byte char */
          cpoint = (c & 0x1f) << 6;
          cb = 1;
      }
      else
      if ((c & 0xf0) == 0xe0) { /* 3 byte char */
          cpoint = (c & 0x0f) << 12;
          cb = 2;
      }
      else
      if ((c & 0xf8) == 0xf0) { /* 4 byte char */
          cpoint = (c & 0x07) << 18;
          cb = 3;
      }
      else {
          /* stray continuation byte (0x80-0xbf) or invalid lead
             byte (0xf8-0xff): returning 0 here would be mistaken
             by callers for the end of the string */
          cpoint = 0xfffd;
      }

      /* process the continuation bytes */
      while (cb--) {
          if ((*p & 0xc0) == 0x80)
              cpoint |= (*p++ & 0x3f) << (cb * 6);
          else {
              /* truncated sequence */
              cpoint = 0xfffd;
              break;
          }
      }

      *str = (char *)p;

      return cpoint;
  }
  static int int_range_cmp(const void *p1, const void *p2)
  /* bsearch() comparator: p1 points to the key, p2 to a table row
     whose first two elements are the inclusive limits of a range */
  {
      unsigned int key = *(const unsigned int *)p1;
      const unsigned int *range = p2;

      if (key < range[0])
          return -1;

      if (key > range[1])
          return 1;

      return 0;
  }


  /* intentionally dead simple: each row is { first, last, width },
     sorted by codepoint so that bsearch() can be used on it */

  static unsigned int xs_unicode_width_table[] = {
      0x300,      0x36f,      0,      /* diacritics */
      0x1100,     0x11ff,     2,      /* Hangul */
      0x2e80,     0xa4cf,     2,      /* CJK */
      0xac00,     0xd7a3,     2,      /* more Hangul */
      0xe000,     0xf8ff,     0,      /* private use */
      0xf900,     0xfaff,     2,      /* CJK compatibility */
      0xff00,     0xff60,     2,      /* full width things */
      0xffdf,     0xffe6,     2,      /* full width things */
      0x1f200,    0x1ffff,    2,      /* emojis */
      0x20000,    0x2fffd,    2       /* more CJK */
  };


  int xs_unicode_width(unsigned int cpoint)
  /* returns the width in columns of a Unicode codepoint (somewhat
     simplified): the value stored in the table row that contains it,
     or 1 if no row does */
  {
      const size_t row_size = 3 * sizeof(unsigned int);
      const size_t n_rows   = sizeof(xs_unicode_width_table) / row_size;
      unsigned int *row;

      row = bsearch(&cpoint, xs_unicode_width_table, n_rows, row_size, int_range_cmp);

      if (row == NULL)
          return 1;

      return row[2];
  }
  115. /** surrogate pairs **/
  int xs_is_surrogate(unsigned int cpoint)
  /* checks if cpoint falls inside the surrogate range 0xd800 - 0xdfff
     (either element of a pair matches); returns 1 if so, 0 if not */
  {
      if (cpoint < 0xd800 || cpoint > 0xdfff)
          return 0;

      return 1;
  }
  unsigned int xs_surrogate_dec(unsigned int p1, unsigned int p2)
  /* "decodes" a surrogate pair into a codepoint; p1 is the first
     (high) element and p2 the second (low) one. Only the 10 low bits
     of each are used, so no validation of the arguments is done */
  {
      /* 20 bit offset: 10 bits from each element */
      unsigned int offset = ((p1 & 0x3ff) << 10) | (p2 & 0x3ff);

      /* the base must be added, not OR-ed: the offset has bit 16 set
         for every codepoint from 0x20000 up, and an OR would silently
         drop the carry (e.g. d840 dc00 would yield 0x10000) */
      return 0x10000 + offset;
  }
  unsigned int xs_surrogate_enc(unsigned int cpoint)
  /* "encodes" a Unicode codepoint into a surrogate pair, packed in one
     value: first element in the upper 16 bits, second in the lower */
  {
      /* 0xd7c0 is 0xd800 - (0x10000 >> 10): adding it both removes the
         0x10000 base and moves the result into the first element range */
      unsigned int hi = (cpoint >> 10) + 0xd7c0;
      unsigned int lo = (cpoint & 0x3ff) + 0xdc00;

      return lo | (hi << 16);
  }
  133. #ifdef _XS_UNICODE_TBL_H
  134. /* include xs_unicode_tbl.h before this one to use these functions */
  static int int_cmp(const void *p1, const void *p2)
  /* bsearch() comparator: compares the unsigned int p1 points to with
     the one p2 points to (the first element of a table row) */
  {
      unsigned int key  = *(const unsigned int *)p1;
      unsigned int item = *(const unsigned int *)p2;

      if (key < item)
          return -1;

      return key > item;
  }
  141. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  142. /* searches for an uppercase codepoint in the case fold table */
  143. {
  144. return bsearch(&cpoint, xs_unicode_case_fold_table,
  145. sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2),
  146. sizeof(unsigned int) * 2,
  147. int_cmp);
  148. }
  149. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  150. /* searches for a lowercase codepoint in the case fold table */
  151. {
  152. unsigned int *p = xs_unicode_case_fold_table + 1;
  153. unsigned int *e = xs_unicode_case_fold_table +
  154. sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int);
  155. while (p < e) {
  156. if (cpoint == *p)
  157. return p;
  158. p += 2;
  159. }
  160. return NULL;
  161. }
  unsigned int xs_unicode_to_upper(unsigned int cpoint)
  /* returns the uppercase counterpart of cpoint, or cpoint itself
     if the lowercase search doesn't find it */
  {
      unsigned int *lower = _xs_unicode_lower_search(cpoint);

      if (lower != NULL) {
          /* the element right before the match is its pair */
          return lower[-1];
      }

      return cpoint;
  }
  unsigned int xs_unicode_to_lower(unsigned int cpoint)
  /* returns the lowercase counterpart of cpoint, or cpoint itself
     if the uppercase search doesn't find it */
  {
      unsigned int *upper = _xs_unicode_upper_search(cpoint);

      if (upper != NULL) {
          /* the element right after the match is its pair */
          return upper[1];
      }

      return cpoint;
  }
  174. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  175. /* applies unicode Normalization Form D */
  176. {
  177. unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table,
  178. sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3),
  179. sizeof(unsigned int) * 3,
  180. int_cmp);
  181. if (r != NULL) {
  182. *base = r[1];
  183. *diac = r[2];
  184. }
  185. return !!r;
  186. }
  187. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  188. /* applies unicode Normalization Form C */
  189. {
  190. unsigned int *p = xs_unicode_nfd_table;
  191. unsigned int *e = xs_unicode_nfd_table +
  192. sizeof(xs_unicode_nfd_table) / sizeof(unsigned int);
  193. while (p < e) {
  194. if (p[1] == base && p[2] == diac) {
  195. *cpoint = p[0];
  196. return 1;
  197. }
  198. p += 3;
  199. }
  200. return 0;
  201. }
  202. int xs_unicode_is_alpha(unsigned int cpoint)
  203. /* checks if a codepoint is an alpha (i.e. a letter) */
  204. {
  205. unsigned int *r = bsearch(&cpoint, xs_unicode_alpha_table,
  206. sizeof(xs_unicode_alpha_table) / (sizeof(unsigned int) * 2),
  207. sizeof(unsigned int) * 2,
  208. int_range_cmp);
  209. return !!r;
  210. }
  211. #endif /* _XS_UNICODE_TBL_H */
  212. #endif /* XS_IMPLEMENTATION */
  213. #endif /* _XS_UNICODE_H */