Source code belongs to this explanation.
/* , Show UTF8 with context. Author: Ruud Harmsen Read Unicode text, and display it in such a way that it is easy to decipher what is in it exactly, even if the fonts are not all there or don't work perfectly. 17 October 2016: Made the program more robust, so it no longer crashes on invalid UTF8 input, such as ISO8859-1. It now neatly indicates those with dashes (--------) where normally the Unicode scalar is shown. 11 April 2020: Fixed a bug which caused a crash on reading an empty file, or pressing ctrl-D immediately on stdin. 11 April 2020: Known bug, no fix available: if a UTF-8 code consisting of 2, 3, or 4 bytes extends over the boundary of the inputbuffer, its follow-up bytes will be reported separately as invalid UTF8. The text itself, and the next character are also correctly reported again, before and after, so nothing gets lost. The effect is most noticeable if the buffer is small, as it was when I was fixing the previous bug and testing the fix for that one. 20 April 2020: For a much improved version see utfcntxt.c */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include "../cgi-src/utftools/utftools.h" #define BYTESINBUF 500 #define CONTEXTTOSHOW 40 static unsigned char inputbuffer[BYTESINBUF + 1]; static unsigned char *utf8charsinbuf[BYTESINBUF + 1]; static unsigned char **toshownow = NULL; /* buffilleduntil can point to the start of incomplete UTF8! */ static unsigned char **buffilleduntil = NULL; /* Points to 1st byte in buffer which does not contain a read char */ static unsigned char *firstnotread = NULL; static long totalbytesread = 0; static size_t getmore (FILE *fp); static int shiftbufs (void); static int mapchar2utf (size_t charsread); int main (int argc, char **argv) { FILE *fpi = stdin; size_t nread; size_t tohandle; if (argc > 1) { fpi = fopen(argv[1], "rb"); } if (!fpi) fpi = stdin; while (getmore(fpi)) { unsigned char **cpp; printf("%06ld 0x%02x-%02x-%02x-%02x ", totalbytesread - (firstnotread - *toshownow), **toshownow, (*toshownow)[1] ? (*toshownow)[1] : 0, (*toshownow)[2] ? (*toshownow)[2] : 0, (*toshownow)[3] ? (*toshownow)[3] : 0); if (isascii(**toshownow) || utf8valid(*toshownow)) { printf("0x%06lx: ", utf2scalar(*toshownow)); } else { /* Invalid UTF8 input */ printf("--------: "); } for (cpp = toshownow; cpp < toshownow + CONTEXTTOSHOW && *cpp < firstnotread; cpp++) { if (isascii(**cpp) && !iscntrl(**cpp)) printf("%c", **cpp); else printf("%c", '.'); } printf("\n"); toshownow++; } return 0; } static size_t getmore (FILE *fp) { static int EOFreached = 0; if (!toshownow || !buffilleduntil) { toshownow = buffilleduntil = utf8charsinbuf; *toshownow = inputbuffer; firstnotread = inputbuffer; } if (buffilleduntil - toshownow < CONTEXTTOSHOW) { size_t charsread, wanttoread; if (toshownow > utf8charsinbuf) { shiftbufs(); } /* Refill at the right of the buffer */ wanttoread = inputbuffer + BYTESINBUF - firstnotread; if (EOFreached) { if (*toshownow < firstnotread) return 1; else return 0; } /* Clear unused part of buffer, for safety */ memset(firstnotread, '\0', (sizeof inputbuffer) - (firstnotread - inputbuffer)); charsread = fread(firstnotread, 1, wanttoread, fp); if (charsread < wanttoread) { /* This test is necessary for the special case of reading an empty file, or immediately pressing ctrl-d on standard input (stdin). */ if (charsread == 0 && toshownow == buffilleduntil) return 0; /* Remember for next time; first handle what is still in the buffer */ EOFreached = 1; } mapchar2utf(charsread); firstnotread += charsread; totalbytesread += charsread; } return 1; } static int shiftbufs (void) { size_t bytestoshift = firstnotread - *toshownow; size_t byteshiftdist = *toshownow - inputbuffer; size_t utfcellstoshift = buffilleduntil - toshownow; size_t utfshiftdist = toshownow - utf8charsinbuf; unsigned char **cpp; /* Shift already handled stuff away to the left */ memmove(inputbuffer, *toshownow, bytestoshift + 1); firstnotread -= byteshiftdist; memmove(utf8charsinbuf, toshownow, (utfcellstoshift + 1) * (sizeof utf8charsinbuf[0])); for (cpp = utf8charsinbuf; cpp <= utf8charsinbuf + utfcellstoshift; cpp++) { *cpp -= byteshiftdist; } toshownow -= utfshiftdist; buffilleduntil -= utfshiftdist; return 0; } static int mapchar2utf (size_t charsread) { unsigned char *cp; unsigned char *until; until = firstnotread + charsread; for (cp = *buffilleduntil; cp <= until; cp += utf8valid(cp) ? utf8len(*cp) : 1, buffilleduntil++) { *buffilleduntil = cp; } buffilleduntil--; return 0; }