/* utftools.h

   Publication 6 May 2016, written 15 December 2008 or before. Updated 28 September 2019
   Source code
 */


/* Copyright 2008 by R. Harmsen.
   But I won't sue anyone who uses or adapts the code.

   28 September 2019: Added function CP1252_to_scalar()
 */

/* 6 February 2020: see also these functions from the standard
   C-library: mblen, mbrlen, mbtowc, mbstowcs, wcstombs, wctomb,
   etc. These do much the same as my functions, plus, they take the
   locale into account.

   See man pages, especially
   man 7 locale
   man 3 setlocale

   See also https://www.openbsd.org/papers/eurobsdcon2016-utf8.pdf
 */

/* 20 February 2020: added function utf8frst */

/***********************************************************************
   This function examines the first byte of a UTF8 sequence,
   and returns that sequence's length (including the first byte
   itself), based on the value of that first byte. This is a
   theoretical value, so the outcome doesn't say anything as
   to whether enough valid UTF8 bytes actually follow, nor as
   to whether any bytes follow at all.

   Return values:
   1 A seven-bit ASCII value
   0 A UTF-8 follow-up byte
   2 through 6: a theoretical UTF-8 start byte, with that number
                of 1-bits at the start.
   1 Anything else, including hex FE or FF.
 **********************************************************************/
int utf8len (int firstbyte);


/***********************************************************************
   This function utf8frst is more reliable than utf8len above, in that
   it only reports the length of a potential UTF-8 byte sequence if
   the start byte is valid (which doesn't mean that all follow-up bytes
   are valid too; those are not checked in this function, which looks
   at only one byte).

   Invalid first bytes are C0 and C1, because they would enable two-byte
   aliases for plain ASCII, and F5 and above, because they would encode
   scalars in ranges that are not allowed as Unicode.

   See http://www.unicode.org/L2/L2000/00374r2-short-utf8.htm and
       http://unicode.org/versions/corrigendum1.html
       "Table 3.1B. Legal UTF-8 Byte Sequences", which states:

   Code Points        1st Byte 2nd Byte 3rd Byte 4th Byte
   U+0000  ..U+007F   00..7F
   U+0080  ..U+07FF   C2..DF   80..BF
   U+0800  ..U+0FFF   E0       A0..BF   80..BF
   U+1000  ..U+FFFF   E1..EF   80..BF   80..BF
   U+10000 ..U+3FFFF  F0       90..BF   80..BF   80..BF
   U+40000 ..U+FFFFF  F1..F3   80..BF   80..BF   80..BF
   U+100000..U+10FFFF F4       80..8F   80..BF   80..BF

   Return values:
   1 A seven-bit ASCII value
   0 A UTF-8 follow-up byte (validity in context is not checked)
   2 through 4: a valid UTF-8 start byte, for a byte sequence
     containing that number of bytes in total.
  -1 Anything else, including hex FE or FF.
 **********************************************************************/
int utf8frst (int firstbyte);


/***********************************************************************
   This function checks if `start' points to a sequence of at least two
   valid UTF8 bytes which represent a single Unicode
   scalar value. If so, the function returns non-zero, else zero.

   Note: the buffer MUST be at least seven bytes long!

   Note that plain ASCII and UTF8 follow-up bytes are also
   considered invalid by this function. Only the start of
   a UTF8 sequence, including its follow-ups, is considered
   valid.

   The character AFTER a valid UTF8 sequence, in a stream of UTF8 text,
   could be either ASCII (including a null byte) or the start
   of a new UTF8 char. It cannot be a UTF8 follow-up byte.
   However, because we also want to be able to test mixed text,
   possibly containing UTF8 and ISO-8859, this is not tested here!
   This is intended behaviour!
 **********************************************************************/
int utf8valid (unsigned char *start);


/***********************************************************************
   This function expects that `start' points to a sequence of valid
   UTF8 bytes which represent a single Unicode
   scalar value. It does not itself check validity.

   Note: the buffer MUST be at least seven bytes long!

   Valid UTF8 in this case does include plain ASCII (seven-bit values).
   This differs from the assumption made by function utf8valid.

   The return value is the Unicode scalar value that the UTF8 sequence
   represents.
 **********************************************************************/
long utf2scalar (unsigned char *start);


/***********************************************************************
   Returns a pointer to a null-terminated byte string in a static buffer.
   `scalar' is a Unicode scalar value.
   The buffer will contain its representation in UTF8.

   NOTE(review): because the result lives in a static buffer, a later
   call presumably overwrites the previous result; copy it if it must
   be kept -- confirm against the implementation.
 **********************************************************************/
unsigned char *scalar2utf8 (long scalar);


/***********************************************************************
   This function tests if a Unicode scalar value can be represented as
   a valid ISO-8859-1 code. This means it must be ASCII (7 bits, less than
   0x80), or be between 0xa0 and 0xff (inclusive).

   Returns non-zero for valid; zero means invalid.
 **********************************************************************/
int isconvertibleISO8859_1 (long scalar);


/***********************************************************************
   This function is like isconvertibleISO8859_1 except that it also
   accepts Unicode scalar values that map to the Windows 1252 byte codes
   in the range 0x80 through 0x9f.

   If p_converted is NULL, that argument is ignored; if a valid
   pointer is passed, the location it points to will be set to the
   corresponding Windows 1252 value (or -1 if invalid).

   NOTE(review): the return convention is not stated here; presumably
   non-zero for valid and zero for invalid, as for
   isconvertibleISO8859_1 -- confirm against the implementation.
 **********************************************************************/
int isconvertibleCP1252 (long scalar, int *p_converted);


/***********************************************************************
   This function expects a Unicode scalar that maps to a valid
   Windows 1252 code in the range 0x80 through 0x9f. It then returns
   that value, else -1.

   Examples:
   Unicode scalar value hex 2026 is the ellipsis (three dots).
   This maps to code position hex 85 in Windows code page 1252.

   Unicode scalar value hex 201d is the closing curly double quote
   (99 shaped, top of the line). This maps to code position
   hex 94 in Windows code page 1252.
 **********************************************************************/
int convert2CP1252 (long scalar);

/***********************************************************************
   This function expects a CP1252 code byte, and returns its Unicode
   scalar, or -1 if it is unconvertible.

   If `CP1252' is also ASCII, or in the valid range of ISO-8859-1, the
   function returns its input. Otherwise, the scalar is looked up
   in a table.
 **********************************************************************/
long CP1252_to_scalar (int CP1252);