/* Source code belongs to this explanation. */


/* Show UTF8 with context.
   Author: Ruud Harmsen

   Read Unicode text, and display it in such a way that it is
   easy to decipher what is in it exactly, even if the fonts
   are not all there or don't work perfectly.

   17 October 2016: Made the program more robust, so it no
   longer crashes on invalid UTF8 input, such as ISO8859-1.
   It now neatly indicates those with dashes (--------) where
   normally the Unicode scalar is shown.

   11 April 2020: Fixed a bug which caused a crash on reading
   an empty file, or pressing ctrl-D immediately on stdin.

   11 April 2020: Known bug, no fix available: if a UTF-8 code
   consisting of 2, 3, or 4 bytes extends over the boundary
   of the inputbuffer, its follow-up bytes will be reported
   separately as invalid UTF8. The text itself, and the next
   character are also correctly reported again, before and
   after, so nothing gets lost. The effect is most noticeable
   if the buffer is small, as it was when I was fixing the
   previous bug and testing the fix for that one.

   20 April 2020: For a much improved version see utfcntxt.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "../cgi-src/utftools/utftools.h"

/* Number of input bytes the byte buffer can hold; the arrays below are
   declared one element larger than this. */
#define BYTESINBUF    500
/* Maximum number of characters of context printed per output line.
   getmore() also refills the buffer once fewer than this many mapped
   characters remain ahead of toshownow. */
#define CONTEXTTOSHOW  40

/* Raw bytes read from the input. */
static unsigned char inputbuffer[BYTESINBUF + 1];
/* Character map: each cell points into inputbuffer at the first byte
   of one character, as laid out by mapchar2utf(). */
static unsigned char *utf8charsinbuf[BYTESINBUF + 1];
/* Cell of the character to be displayed next; NULL until the first
   call of getmore(). */
static unsigned char **toshownow = NULL;
/* Last cell filled in by mapchar2utf(); NULL until the first call of
   getmore().
   buffilleduntil can point to the start of incomplete UTF8! */
static unsigned char **buffilleduntil = NULL;
/* Points to 1st byte in buffer which does not contain a read char */
static unsigned char *firstnotread = NULL;
/* Total number of bytes obtained from fread() so far. */
static long totalbytesread = 0;

/* Local helpers, defined below main(). */
static size_t getmore (FILE *fp);
static int shiftbufs (void);
static int mapchar2utf (size_t charsread);

int main (int argc, char **argv)
{
   FILE *fpi = stdin;
   size_t nread;
   size_t tohandle;

   if (argc > 1)
   {
      fpi = fopen(argv[1], "rb");
   }
   if (!fpi)
      fpi = stdin;

   while (getmore(fpi))
   {
      unsigned char **cpp;

      printf("%06ld 0x%02x-%02x-%02x-%02x ",
         totalbytesread - (firstnotread - *toshownow),
         **toshownow,
         (*toshownow)[1] ? (*toshownow)[1] : 0,
         (*toshownow)[2] ? (*toshownow)[2] : 0,
         (*toshownow)[3] ? (*toshownow)[3] : 0);

      if (isascii(**toshownow) || utf8valid(*toshownow))
      {
         printf("0x%06lx: ", utf2scalar(*toshownow));
      }
      else
      {
         /* Invalid UTF8 input */
         printf("--------: ");
      }

      for (cpp = toshownow;
           cpp < toshownow + CONTEXTTOSHOW && *cpp < firstnotread;
           cpp++)
      {
         if (isascii(**cpp) && !iscntrl(**cpp))
            printf("%c", **cpp);
         else
            printf("%c", '.');
      }
      printf("\n");
      toshownow++;
   }

   return 0;
}

/* Make sure there is something to show at *toshownow, refilling the
   buffers from fp when fewer than CONTEXTTOSHOW mapped characters
   remain ahead of toshownow.

   fp: stream to read from.

   Returns 1 while a character is available for display, 0 once the
   input is exhausted (including the case of an input that was empty
   from the start). */
static size_t getmore (FILE *fp)
{
   static int EOFreached = 0;
   size_t room;
   size_t got;

   /* Very first call: point everything at the start of the buffers */
   if (toshownow == NULL || buffilleduntil == NULL)
   {
      buffilleduntil = utf8charsinbuf;
      toshownow      = utf8charsinbuf;
      *toshownow     = inputbuffer;
      firstnotread   = inputbuffer;
   }

   /* Still enough context ahead: nothing to do */
   if (buffilleduntil - toshownow >= CONTEXTTOSHOW)
   {
      return 1;
   }

   /* Discard what has been shown already, to make room on the right */
   if (toshownow > utf8charsinbuf)
   {
      shiftbufs();
   }

   /* Nothing more will come; report whether any data is left to show */
   if (EOFreached)
   {
      return (*toshownow < firstnotread) ? 1 : 0;
   }

   /* Free space at the right-hand side of the byte buffer */
   room = inputbuffer + BYTESINBUF - firstnotread;

   /* Clear unused part of buffer, for safety */
   memset(firstnotread, '\0',
      (sizeof inputbuffer) - (firstnotread - inputbuffer));

   got = fread(firstnotread, 1, room, fp);
   if (got < room)
   {
      /* Special case: an empty file, or ctrl-d pressed immediately on
         standard input - there is nothing at all to display. */
      if (got == 0 && toshownow == buffilleduntil)
      {
         return 0;
      }

      /* Short read: remember it for the next call, but first let the
         caller handle what is still in the buffer. */
      EOFreached = 1;
   }

   mapchar2utf(got);
   firstnotread   += got;
   totalbytesread += got;

   return 1;
}

/* Drop everything that has already been shown: move the bytes from
   *toshownow onwards to the start of inputbuffer, move the matching
   cells to the start of utf8charsinbuf, and adjust toshownow,
   buffilleduntil and firstnotread accordingly.

   The bytes vacated at the right-hand side are set to zero, so that
   code looking a few bytes beyond firstnotread (the hex dump in
   main(), the UTF-8 helpers) never sees leftovers of data that has
   been moved away.

   Always returns 0. */
static int shiftbufs (void)
{
   unsigned char *keepfrom = *toshownow;
   size_t bytestoshift    = firstnotread - keepfrom;
   size_t byteshiftdist   = keepfrom - inputbuffer;
   size_t utfcellstoshift = buffilleduntil - toshownow;
   size_t utfshiftdist    = toshownow - utf8charsinbuf;
   size_t i;

   /* Shift already handled stuff away to the left; the "+ 1" takes
      the byte at firstnotread along as well. */
   memmove(inputbuffer, keepfrom, bytestoshift + 1);
   firstnotread -= byteshiftdist;

   /* Wipe the stale tail: the byteshiftdist bytes following the new
      firstnotread, ending at the old firstnotread (still in bounds). */
   memset(firstnotread + 1, '\0', byteshiftdist);

   /* Same for the character map, including the cell at
      buffilleduntil itself. */
   memmove(utf8charsinbuf, toshownow,
          (utfcellstoshift + 1) * (sizeof utf8charsinbuf[0]));

   /* The moved cells still hold the old byte positions; rebase them */
   for (i = 0; i <= utfcellstoshift; i++)
   {
      utf8charsinbuf[i] -= byteshiftdist;
   }

   toshownow      -= utfshiftdist;
   buffilleduntil -= utfshiftdist;

   return 0;
}

/* Extend the character map over freshly read bytes.

   charsread: number of bytes just stored at firstnotread (which the
              caller has not advanced yet).

   Starting from the position held in *buffilleduntil, one cell is
   written per character up to and including the position just past
   the new data. A position for which utf8valid() holds advances by
   utf8len() bytes, any other position by a single byte. On return
   buffilleduntil designates the last cell written.

   Always returns 0. */
static int mapchar2utf (size_t charsread)
{
   unsigned char *limit = firstnotread + charsread;
   unsigned char *pos   = *buffilleduntil;

   while (pos <= limit)
   {
      *buffilleduntil = pos;

      if (utf8valid(pos))
         pos += utf8len(*pos);
      else
         pos += 1;

      buffilleduntil++;
   }

   /* Step back from one-past to the last cell actually written */
   buffilleduntil--;

   return 0;
}