Publication, written on or before 15 December 2008. Updated 28 September 2019 and February 2020.
Sources belong to this explanation.
/* Copyright 2008 by R. Harmsen. But I won't sue anyone who uses or adapts the code. 28 September 2019: Added function CP1252_to_scalar() */ #include <string.h> /* Usage instructions of each function are in the following include file */ #include "utftools.h" /****************************************************************/ int utf8len (int firstbyte) { if ((firstbyte & ~0x7f) == 0) return 1; if ((firstbyte & ~0x3f) == 0x80) return 0; /* Not the start of UTF8, but a follow-up byte */ if ((firstbyte & ~0x1f) == 0xc0) return 2; if ((firstbyte & ~0x0f) == 0xe0) return 3; if ((firstbyte & ~0x07) == 0xf0) return 4; if ((firstbyte & ~0x03) == 0xf8) return 5; if ((firstbyte & ~0x01) == 0xFC) return 6; /* All possible 8-bit byte values containing at least one zero bit have been covered above. So we could only get here in case of 0xff. We deem that to be length 1 as well. */ return 1; } /****************************************************************/ int utf8frst (int firstbyte) { if ((firstbyte & ~0x7f) == 0) return 1; if ((firstbyte & ~0x3f) == 0x80) return 0; /* Not the start of UTF8, but a follow-up byte */ if ((firstbyte & ~0x1f) == 0xc0) { /* 7 bits/1 byte ASCII could be encoded in 2 bytes (for 11 bits) too, but this is forbidden to avert spoofing. So bits 5 to 1 (assuming bit 0 is rightmost) must no be all zero. */ if ((firstbyte & 0x3e) == 0) return -1; return 2; } if ((firstbyte & ~0x0f) == 0xe0) { return 3; } if ((firstbyte & ~0x07) == 0xf0) { /* See http://www.unicode.org/L2/L2000/00374r2-short-utf8.htm and http://unicode.org/versions/corrigendum1.html "Table 3.1B. Legal UTF-8 Byte Sequences" */ if (firstbyte > 0xf4) return -1; return 4; } /* Unicode currently only supports scalar of 21 bits maximum, encoded in 4 bytes. Everything else is, and probably will be, for ever, unsupported and therefore wrong. 
*/ if ((firstbyte & ~0x03) == 0xf8 || (firstbyte & ~0x01) == 0xFC || (firstbyte & ~0x00) == 0xFF) return -1; return -1; } /****************************************************************/ int utf8valid (unsigned char *buf) { int i, max; max = utf8len(buf[0]); if (max > 6 || max <= 1) return 0; /* Note that simple Ascii and UTF8 follow-up chars are also considered non-valid. Only the start of a UTF8 sequence, including its follow-ups, is considered valid (and returns 1) */ for (i = 1; i < max; i++) { if ((buf[i] & ~0x3f) != 0x80) { /* Not a UTF8 follow-up character, so the UTF8 sequence isn't valid */ return 0; } } /* The character AFTER a valid UTF8, in a stream of UTF8 text, could be either ASCII (including a null byte) or the start of a new UTF8 char. It cannot be a UTF8 follow-up byte. However, because we want to be able to test also mixed text, possibly containing UTF8 and ISO-8859, this is not tested here! So this is intended behaviour! */ /* No invalidity has been detected so far, so we assume a valid UTF8 sequence. 
*/ return 1; } /****************************************************************/ long utf2scalar (unsigned char *start) { switch (utf8len(*start)) { default: case 1: return *start; case 2: if (start[1]) { return (((unsigned long)start[0] & 0x1f) << 6) | ((unsigned long)start[1] & 0x3f); } case 3: if (start[1] && start[2]) { return (((unsigned long)start[0] & 0x0f) << 12) | (((unsigned long)start[1] & 0x3f) << 6) | ((unsigned long)start[2] & 0x3f); } case 4: if (start[1] && start[2] && start[3]) { return (((unsigned long)start[0] & 0x07) << 18) | (((unsigned long)start[1] & 0x3f) << 12) | (((unsigned long)start[2] & 0x3f) << 6) | ((unsigned long)start[3] & 0x3f); } case 5: if (start[1] && start[2] && start[3] && start[4]) { return (((unsigned long)start[0] & 0x03) << 24) | (((unsigned long)start[1] & 0x3f) << 18) | (((unsigned long)start[2] & 0x3f) << 12) | (((unsigned long)start[3] & 0x3f) << 6) | ((unsigned long)start[4] & 0x3f); } case 6: if (start[1] && start[2] && start[3] && start[4] && start[5]) { return (((unsigned long)start[0] & 0x03) << 30) | (((unsigned long)start[1] & 0x3f) << 24) | (((unsigned long)start[2] & 0x3f) << 18) | (((unsigned long)start[3] & 0x3f) << 12) | (((unsigned long)start[4] & 0x3f) << 6) | ((unsigned long)start[5] & 0x3f); } } return *start; } /****************************************************************/ unsigned char *scalar2utf8 (long scalar) { static unsigned char utf[7]; memset(utf, '\0', sizeof utf); if (scalar <= 0x7f) /* 7 bits */ { utf[0] = scalar & 0x7f; } else if (scalar <= 0x7ffL) /* 11 bits */ { utf[0] = 0xc0 | ((scalar >> 6) & 0x1f); utf[1] = 0x80 | ( scalar & 0x3f); } else if (scalar <= 0xffffL) /* 16 bits */ { utf[0] = 0xe0 | ((scalar >> 12) & 0x0f); utf[1] = 0x80 | ((scalar >> 6) & 0x3f); utf[2] = 0x80 | ( scalar & 0x3f); } else if (scalar <= 0x1fffffL) /* 21 bits */ { utf[0] = 0xf0 | ((scalar >> 18) & 0x07); utf[1] = 0x80 | ((scalar >> 12) & 0x3f); utf[2] = 0x80 | ((scalar >> 6) & 0x3f); utf[3] = 0x80 | ( scalar 
& 0x3f); } else if (scalar <= 0x3ffffffL) /* 26 bits */ { utf[0] = 0xf8 | ((scalar >> 24) & 0x03); utf[1] = 0x80 | ((scalar >> 18) & 0x3f); utf[2] = 0x80 | ((scalar >> 12) & 0x3f); utf[3] = 0x80 | ((scalar >> 6) & 0x3f); utf[4] = 0x80 | ( scalar & 0x3f); } else if (scalar <= 0x7fffffffL) /* 31 bits */ { utf[0] = 0xfc | ((scalar >> 30) & 0x01); utf[1] = 0x80 | ((scalar >> 24) & 0x3f); utf[2] = 0x80 | ((scalar >> 18) & 0x3f); utf[3] = 0x80 | ((scalar >> 12) & 0x3f); utf[4] = 0x80 | ((scalar >> 6) & 0x3f); utf[5] = 0x80 | ( scalar & 0x3f); } return utf; } /****************************************************************/ int isconvertibleISO8859_1 (long scalar) { if (scalar < 0x80) return 1; if (scalar >= 0xa0 && scalar <= 0xff) return 1; return 0; } /****************************************************************/ int isconvertibleCP1252 (long scalar, int *p_converted) { int canbe = 0; long converted; if (isconvertibleISO8859_1(scalar)) { converted = scalar; canbe = 1; } else { converted = convert2CP1252(scalar); if (converted != -1) { canbe = 1; } } if (canbe) { /* Optional return of conversion value, only if a valid pointer was passed in the call to the function. 
*/ if (p_converted) *p_converted = converted; } return canbe; } /****************************************************************/ static long CP1252_conversiontable[] = { 0x20ac, /* 0x80 */ -1, /* 0x81 */ 0x201a, /* 0x82 */ 0x0192, /* 0x83 */ 0x201e, /* 0x84 */ 0x2026, /* 0x85 */ 0x2020, /* 0x86 */ 0x2021, /* 0x87 */ 0x02c6, /* 0x88 */ 0x2030, /* 0x89 */ 0x0160, /* 0x8a */ 0x2039, /* 0x8b */ 0x0152, /* 0x8c */ -1, /* 0x8d */ 0x017d, /* 0x8e */ -1, /* 0x8f */ -1, /* 0x90 */ 0x2018, /* 0x91 */ 0x2019, /* 0x92 */ 0x201c, /* 0x93 */ 0x201d, /* 0x94 */ 0x2022, /* 0x95 */ 0x2013, /* 0x96 */ 0x2014, /* 0x97 */ 0x02dc, /* 0x98 */ 0x2122, /* 0x99 */ 0x0161, /* 0x9a */ 0x203a, /* 0x9b */ 0x0153, /* 0x9c */ -1, /* 0x9d */ 0x0017e, /* 0x9e */ 0x00178, /* 0x9f */ }; #define numelt_CP1252_conversiontable \ (sizeof CP1252_conversiontable) / \ (sizeof CP1252_conversiontable[0]) /****************************************************************/ int convert2CP1252 (long scalar) { int i; if (scalar == -1) return -1; /* Linear search, could have used lsearch but this is probably almost as fast, and simpler. */ for (i = 0; i < numelt_CP1252_conversiontable; i++) { if (CP1252_conversiontable[i] == scalar) return i + 0x80; } return -1; } /****************************************************************/ long CP1252_to_scalar (int CP1252) { int i; if (isconvertibleISO8859_1(CP1252)) return CP1252; i = CP1252 - 0x80; if (i > sizeof CP1252_conversiontable / sizeof CP1252_conversiontable[0]) return -1; else return CP1252_conversiontable[i]; } /****************************************************************/ #ifdef Main #include <stdio.h> int main (int argc, char argv[]) { long l, after; unsigned char *utf; for (l = 0x80; l < 100000000L; l++) { utf = scalar2utf8(l); if (!utf8valid(utf)) { printf("Invalid UTF at %lx\n", l); } else if ((after = utf2scalar(utf)) != l) { printf("%ld\n", l); printf("%ld\n", after); printf("%ld appears as %ld\n", l, after); } } return 0; } #endif