xs_unicode.h 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  /* copyright (c) 2022 - 2023 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
  4. int _xs_utf8_enc(char buf[4], unsigned int cpoint);
  5. xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);
  6. unsigned int xs_utf8_dec(char **str);
  7. int xs_unicode_width(unsigned int cpoint);
  8. unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
  9. unsigned int *_xs_unicode_lower_search(unsigned int cpoint);
  10. #define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
  11. #define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))
  12. unsigned int xs_unicode_to_upper(unsigned int cpoint);
  13. unsigned int xs_unicode_to_lower(unsigned int cpoint);
  14. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);
  15. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
  16. #ifdef XS_IMPLEMENTATION
  int _xs_utf8_enc(char buf[4], unsigned int cpoint)
  /* encodes a Unicode codepoint to utf-8 into buf and returns the size
     in bytes (1 to 4). buf is not NUL-terminated. Codepoints above
     0x10ffff cannot be expressed in utf-8 and are stored as U+FFFD
     (the replacement character); surrogate values are encoded as is */
  {
      unsigned char *p = (unsigned char *)buf;

      /* out of the Unicode range: the lead byte would overflow */
      if (cpoint > 0x10ffff)
          cpoint = 0xfffd;

      if (cpoint < 0x80) {
          /* 1 byte char */
          *p++ = cpoint & 0xff;
      }
      else
      if (cpoint < 0x800) {
          /* 2 byte char */
          *p++ = 0xc0 | (cpoint >> 6);
          *p++ = 0x80 | (cpoint & 0x3f);
      }
      else
      if (cpoint < 0x10000) {
          /* 3 byte char */
          *p++ = 0xe0 | (cpoint >> 12);
          *p++ = 0x80 | ((cpoint >> 6) & 0x3f);
          *p++ = 0x80 | (cpoint & 0x3f);
      }
      else {
          /* 4 byte char */
          *p++ = 0xf0 | (cpoint >> 18);
          *p++ = 0x80 | ((cpoint >> 12) & 0x3f);
          *p++ = 0x80 | ((cpoint >> 6) & 0x3f);
          *p++ = 0x80 | (cpoint & 0x3f);
      }

      return (int)(p - (unsigned char *)buf);
  }
  39. xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
  40. /* encodes an Unicode codepoint to utf-8 into str */
  41. {
  42. char tmp[4];
  43. int c = _xs_utf8_enc(tmp, cpoint);
  44. return xs_append_m(str, tmp, c);
  45. }
  unsigned int xs_utf8_dec(char **str)
  /* decodes one utf-8 char starting at *str and updates the pointer to
     the first byte not consumed. Returns the codepoint, or 0xfffd
     (replacement character) if the lead byte is not valid, if a
     continuation byte is missing or if the result is above 0x10ffff.
     The lead byte is always consumed, even if it's the NUL terminator
     (0 is returned in that case). Overlong encodings and surrogates
     are not rejected */
  {
      unsigned char *p = (unsigned char *)*str;
      unsigned int cpoint = 0;
      int c = *p++;
      int cb = 0;      /* continuation bytes still expected */

      if ((c & 0x80) == 0) { /* 1 byte char */
          cpoint = c;
      }
      else
      if ((c & 0xe0) == 0xc0) { /* 2 byte char */
          cpoint = (c & 0x1f) << 6;
          cb = 1;
      }
      else
      if ((c & 0xf0) == 0xe0) { /* 3 byte char */
          cpoint = (c & 0x0f) << 12;
          cb = 2;
      }
      else
      if ((c & 0xf8) == 0xf0) { /* 4 byte char */
          cpoint = (c & 0x07) << 18;
          cb = 3;
      }
      else {
          /* stray continuation byte or invalid lead byte: don't
             return 0, as callers can't tell it from end of string */
          cpoint = 0xfffd;
      }

      /* process the continuation bytes */
      while (cb--) {
          if ((*p & 0xc0) == 0x80)
              cpoint |= (unsigned int)(*p++ & 0x3f) << (cb * 6);
          else {
              /* broken sequence: the offending byte is not consumed,
                 so the next call restarts from it */
              cpoint = 0xfffd;
              break;
          }
      }

      /* 4 byte sequences can express values beyond the Unicode range */
      if (cpoint > 0x10ffff)
          cpoint = 0xfffd;

      *str = (char *)p;

      return cpoint;
  }
  /* intentionally dead simple */
  /* triplets of { first codepoint, last codepoint, width in columns },
     in ascending, non-overlapping order (xs_unicode_width() relies on it) */
  static const unsigned int xs_unicode_width_table[] = {
      0x300,      0x36f,      0,      /* diacritics */
      0x1100,     0x11ff,     2,      /* Hangul */
      0x2e80,     0xa4cf,     2,      /* CJK */
      0xac00,     0xd7a3,     2,      /* more Hangul */
      0xe000,     0xf8ff,     0,      /* private use */
      0xf900,     0xfaff,     2,      /* CJK compatibility */
      0xff00,     0xff60,     2,      /* full width things */
      0xffdf,     0xffe6,     2,      /* full width things */
      0x1f200,    0x1ffff,    2,      /* emojis */
      0x20000,    0x2fffd,    2       /* more CJK */
  };

  int xs_unicode_width(unsigned int cpoint)
  /* returns the width in columns of a Unicode codepoint (somewhat
     simplified): the width stored in the table if cpoint falls inside
     one of its ranges, 1 if it falls before or between ranges, and 0
     if it's beyond the last one */
  {
      const unsigned int *p = xs_unicode_width_table;
      const unsigned int *e = p + sizeof(xs_unicode_width_table) /
                                  sizeof(xs_unicode_width_table[0]);

      for (; p < e; p += 3) {
          /* ranges are sorted: if cpoint is below this one, it's
             in a gap, so it has the default width */
          if (cpoint < p[0])
              return 1;

          /* here cpoint >= p[0], so only the upper bound matters */
          if (cpoint <= p[1])
              return (int)p[2];
      }

      /* past the end of the table */
      return 0;
  }
  110. #ifdef _XS_UNICODE_TBL_H
  /* include xs_unicode_tbl.h before this one to use these functions */
  static int int_cmp(const void *p1, const void *p2)
  /* bsearch() comparator: orders two entries by the unsigned int
     stored at their start (any further values in the entry are
     ignored). Returns -1, 0 or 1 */
  {
      const unsigned int *a = p1;
      const unsigned int *b = p2;

      /* explicit comparisons instead of a subtraction, which would
         wrap around for unsigned values */
      return *a < *b ? -1 : *a > *b ? 1 : 0;
  }
  118. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  119. /* searches for an uppercase codepoint in the case fold table */
  120. {
  121. return bsearch(&cpoint, xs_unicode_case_fold_table,
  122. sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2),
  123. sizeof(unsigned int) * 2,
  124. int_cmp);
  125. }
  126. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  127. /* searches for a lowercase codepoint in the case fold table */
  128. {
  129. unsigned int *p = xs_unicode_case_fold_table + 1;
  130. unsigned int *e = xs_unicode_case_fold_table +
  131. sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int);
  132. while (p < e) {
  133. if (cpoint == *p)
  134. return p;
  135. p += 2;
  136. }
  137. return NULL;
  138. }
  unsigned int xs_unicode_to_upper(unsigned int cpoint)
  /* returns the cpoint to uppercase, or cpoint itself if it's not a
     lowercase entry in the case fold table */
  {
      unsigned int *p = _xs_unicode_lower_search(cpoint);

      /* p points to the lowercase half of a pair; the uppercase
         half is the value just before it */
      return p == NULL ? cpoint : p[-1];
  }
  unsigned int xs_unicode_to_lower(unsigned int cpoint)
  /* returns the cpoint to lowercase, or cpoint itself if it's not an
     uppercase entry in the case fold table */
  {
      unsigned int *p = _xs_unicode_upper_search(cpoint);

      /* p points to the uppercase half of a pair; the lowercase
         half is the value just after it */
      return p == NULL ? cpoint : p[1];
  }
  151. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  152. /* applies unicode Normalization Form D */
  153. {
  154. unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table,
  155. sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3),
  156. sizeof(unsigned int) * 3,
  157. int_cmp);
  158. if (r != NULL) {
  159. *base = r[1];
  160. *diac = r[2];
  161. }
  162. return !!r;
  163. }
  164. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  165. /* applies unicode Normalization Form C */
  166. {
  167. unsigned int *p = xs_unicode_nfd_table;
  168. unsigned int *e = xs_unicode_nfd_table +
  169. sizeof(xs_unicode_nfd_table) / sizeof(unsigned int);
  170. while (p < e) {
  171. if (p[1] == base && p[2] == diac) {
  172. *cpoint = p[0];
  173. return 1;
  174. }
  175. p += 3;
  176. }
  177. return 0;
  178. }
  179. #endif /* _XS_UNICODE_TBL_H */
  180. #endif /* XS_IMPLEMENTATION */
  181. #endif /* _XS_UNICODE_H */