xs_unicode.h 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. /* copyright (c) 2022 - 2023 grunfink et al. / MIT license */
  2. #ifndef _XS_UNICODE_H
  3. #define _XS_UNICODE_H
/* encodes the codepoint cpoint as utf-8 and appends the resulting bytes
   to str (through xs_append_m); returns whatever xs_append_m returns */
xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint);

/* decodes the utf-8 char pointed to by *str, moves *str past the bytes
   consumed and returns the codepoint (0xfffd if a continuation byte
   is malformed) */
unsigned int xs_utf8_dec(char **str);

/* internal lookups in the case fold table; they return a pointer inside
   the table (to the matching uppercase / lowercase element of a pair)
   or NULL if cpoint is not there. Only defined when the implementation
   is compiled with _XS_UNICODE_TBL_H set */
unsigned int *_xs_unicode_upper_search(unsigned int cpoint);
unsigned int *_xs_unicode_lower_search(unsigned int cpoint);

/* evaluate to 1 if cpoint is found as an uppercase / lowercase element
   of the case fold table, 0 otherwise */
#define xs_unicode_is_upper(cpoint) (!!_xs_unicode_upper_search(cpoint))
#define xs_unicode_is_lower(cpoint) (!!_xs_unicode_lower_search(cpoint))

/* case conversion; cpoint is returned unchanged if the case fold table
   has no mapping for it */
unsigned int xs_unicode_to_upper(unsigned int cpoint);
unsigned int xs_unicode_to_lower(unsigned int cpoint);

/* looks cpoint up in the NFD table; if found, stores its two components
   in *base and *diac and returns 1, otherwise returns 0 and leaves both
   untouched */
int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac);

/* inverse lookup: if the NFD table has an entry made of base + diac,
   stores its codepoint in *cpoint and returns 1, otherwise returns 0
   and leaves *cpoint untouched */
int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint);
  14. #ifdef XS_IMPLEMENTATION
  15. char *_xs_utf8_enc(char buf[4], unsigned int cpoint)
  16. /* encodes an Unicode codepoint to utf-8 into buf and returns the new position */
  17. {
  18. unsigned char *p = (unsigned char *)buf;
  19. if (cpoint < 0x80) /* 1 byte char */
  20. *p++ = cpoint & 0xff;
  21. else {
  22. if (cpoint < 0x800) /* 2 byte char */
  23. *p++ = 0xc0 | (cpoint >> 6);
  24. else {
  25. if (cpoint < 0x10000) /* 3 byte char */
  26. *p++ = 0xe0 | (cpoint >> 12);
  27. else { /* 4 byte char */
  28. *p++ = 0xf0 | (cpoint >> 18);
  29. *p++ = 0x80 | ((cpoint >> 12) & 0x3f);
  30. }
  31. *p++ = 0x80 | ((cpoint >> 6) & 0x3f);
  32. }
  33. *p++ = 0x80 | (cpoint & 0x3f);
  34. }
  35. return (char *)p;
  36. }
  37. xs_str *xs_utf8_enc(xs_str *str, unsigned int cpoint)
  38. /* encodes an Unicode codepoint to utf-8 into str */
  39. {
  40. char tmp[4], *p;
  41. p = _xs_utf8_enc(tmp, cpoint);
  42. return xs_append_m(str, tmp, p - tmp);
  43. }
  44. unsigned int xs_utf8_dec(char **str)
  45. /* decodes an utf-8 char inside str and updates the pointer */
  46. {
  47. unsigned char *p = (unsigned char *)*str;
  48. unsigned int cpoint = 0;
  49. int c = *p++;
  50. int cb = 0;
  51. if ((c & 0x80) == 0) { /* 1 byte char */
  52. cpoint = c;
  53. }
  54. else
  55. if ((c & 0xe0) == 0xc0) { /* 2 byte char */
  56. cpoint = (c & 0x1f) << 6;
  57. cb = 1;
  58. }
  59. else
  60. if ((c & 0xf0) == 0xe0) { /* 3 byte char */
  61. cpoint = (c & 0x0f) << 12;
  62. cb = 2;
  63. }
  64. else
  65. if ((c & 0xf8) == 0xf0) { /* 4 byte char */
  66. cpoint = (c & 0x07) << 18;
  67. cb = 3;
  68. }
  69. /* process the continuation bytes */
  70. while (cb--) {
  71. if ((*p & 0xc0) == 0x80)
  72. cpoint |= (*p++ & 0x3f) << (cb * 6);
  73. else {
  74. cpoint = 0xfffd;
  75. break;
  76. }
  77. }
  78. *str = (char *)p;
  79. return cpoint;
  80. }
  81. #ifdef _XS_UNICODE_TBL_H
/* xs_unicode_tbl.h must be included before this point for these functions to be compiled */
  83. static int int_cmp(const void *p1, const void *p2)
  84. {
  85. const unsigned int *a = p1;
  86. const unsigned int *b = p2;
  87. return *a < *b ? -1 : *a > *b ? 1 : 0;
  88. }
  89. unsigned int *_xs_unicode_upper_search(unsigned int cpoint)
  90. /* searches for an uppercase codepoint in the case fold table */
  91. {
  92. return bsearch(&cpoint, xs_unicode_case_fold_table,
  93. sizeof(xs_unicode_case_fold_table) / (sizeof(unsigned int) * 2),
  94. sizeof(unsigned int) * 2,
  95. int_cmp);
  96. }
  97. unsigned int *_xs_unicode_lower_search(unsigned int cpoint)
  98. /* searches for a lowercase codepoint in the case fold table */
  99. {
  100. unsigned int *p = xs_unicode_case_fold_table + 1;
  101. unsigned int *e = xs_unicode_case_fold_table +
  102. sizeof(xs_unicode_case_fold_table) / sizeof(unsigned int);
  103. while (p < e) {
  104. if (cpoint == *p)
  105. return p;
  106. p += 2;
  107. }
  108. return NULL;
  109. }
  110. unsigned int xs_unicode_to_upper(unsigned int cpoint)
  111. /* returns the cpoint to uppercase */
  112. {
  113. unsigned int *p = _xs_unicode_lower_search(cpoint);
  114. return p == NULL ? cpoint : p[-1];
  115. }
  116. unsigned int xs_unicode_to_lower(unsigned int cpoint)
  117. /* returns the cpoint to lowercase */
  118. {
  119. unsigned int *p = _xs_unicode_upper_search(cpoint);
  120. return p == NULL ? cpoint : p[1];
  121. }
  122. int xs_unicode_nfd(unsigned int cpoint, unsigned int *base, unsigned int *diac)
  123. /* applies unicode Normalization Form D */
  124. {
  125. unsigned int *r = bsearch(&cpoint, xs_unicode_nfd_table,
  126. sizeof(xs_unicode_nfd_table) / (sizeof(unsigned int) * 3),
  127. sizeof(unsigned int) * 3,
  128. int_cmp);
  129. if (r != NULL) {
  130. *base = r[1];
  131. *diac = r[2];
  132. }
  133. return !!r;
  134. }
  135. int xs_unicode_nfc(unsigned int base, unsigned int diac, unsigned int *cpoint)
  136. /* applies unicode Normalization Form C */
  137. {
  138. unsigned int *p = xs_unicode_nfd_table;
  139. unsigned int *e = xs_unicode_nfd_table +
  140. sizeof(xs_unicode_nfd_table) / sizeof(unsigned int);
  141. while (p < e) {
  142. if (p[1] == base && p[2] == diac) {
  143. *cpoint = p[0];
  144. return 1;
  145. }
  146. p += 3;
  147. }
  148. return 0;
  149. }
  150. #endif /* _XS_UNICODE_TBL_H */
  151. #endif /* XS_IMPLEMENTATION */
  152. #endif /* _XS_UNICODE_H */