/* Source code belonging to this explanation. */


/* By Ruud Harmsen.
   Improved implementation of my 2008 utf8cntx.c,
   producing the exact same output, minus the bug.

   Idea: do only minimal buffering yourself, instead
   rely on the stdio library's built-in buffering,
   repositioning as needed using functions ftell()
   and fseek().
   As a consequence, the input can no longer come
   from stdin or a pipe, it has to be a disk file.

   Also, missing bytes at the end (if the number of
   bytes in the file is not a multiple of four) are
   now displayed as "..", not "00", because they are
   absent from the file, not zero bytes.

   The new version is somewhat slower than the old,
   but as the whole idea only makes sense for smallish
   files (because the output is intended for human
   examination), that is no issue.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "../cgi-src/utftools/utftools.h"

/* MAXUTFLEN is the size, in bytes, of the window that is read and examined
   at each file position.  It can only be 4, or UTF16 would be broken.
   And no more will ever be needed. */
#define MAXUTFLEN 4
/* Window of bytes filled by getmore() and examined by display(). */
unsigned char bytesbuf[MAXUTFLEN];
/* Number of bytes the most recent getmore() actually read into bytesbuf. */
size_t bytesinbuf = 0;
/* Upper bound on the number of characters display_context() prints
   on one output line. */
#define CONTEXTLEN 40
/* File offset of bytesbuf[0]: set by getmore(), advanced by display(),
   and used by display_context() as the starting point of the context. */
long posinfile = 0;

/* Prototypes for the file-local helpers defined after main(). */
static size_t getmore       (FILE *fpi);
static void display         (FILE *fpi, FILE *fpo);
static void display_context (FILE *fpi, FILE *fpo);

/* Entry point: dump the file named by the single command-line argument,
 * one output line per character position, to stdout.
 *
 * Exit status:
 *   0  success
 *   1  wrong number of arguments
 *   2  the input file could not be opened
 *   3  the input file is not seekable (pipe, FIFO, terminal, ...)
 *   4  a read error occurred while processing the input
 */
int main (int argc, char **argv)
{
   FILE *fpi = NULL;
   FILE *fpo = stdout;
   int status = 0;

   if (argc != 2)
   {
      fprintf(stderr, "Usage: %s inputfile\n", argv[0]);
      exit(1);
   }

   fpi = fopen(argv[1], "rb");
   if (!fpi)
   {
      fprintf(stderr, "%s: Cannot open inputfile %s\n", argv[0], argv[1]);
      exit(2);
   }

   /* Every step repositions the stream with ftell()/fseek().  On input
      that cannot seek those calls fail and the offsets printed would be
      meaningless, so refuse such input up front. */
   if (fseek(fpi, 0L, SEEK_SET) != 0)
   {
      fprintf(stderr, "%s: Inputfile %s is not seekable\n", argv[0], argv[1]);
      fclose(fpi);
      exit(3);
   }

   while (getmore(fpi))
   {
      display(fpi, fpo);
   }

   /* getmore() returns 0 both at end of file and on a read error;
      tell the two apart so a failed read is not reported as success. */
   if (ferror(fpi))
   {
      fprintf(stderr, "%s: Error reading inputfile %s\n", argv[0], argv[1]);
      status = 4;
   }

   fclose(fpi);

   return status;
}

/* Read the next window of up to MAXUTFLEN bytes from fpi into bytesbuf.
 *
 * Side effects: posinfile receives the stream offset at which the read
 * started, bytesinbuf receives the number of bytes obtained, and the part
 * of bytesbuf that this read did not fill is set to zero.
 *
 * Returns the number of bytes read; 0 means nothing more could be read
 * (end of file or read error).
 */
static size_t getmore (FILE *fpi)
{
   size_t red;

   posinfile = ftell(fpi);
   red = fread(bytesbuf, 1, MAXUTFLEN, fpi);

   /* A short read leaves the tail of bytesbuf holding bytes from the
      previous window.  The whole buffer is later examined as one unit,
      so clear the tail: 0x00 is not a UTF-8 continuation byte, hence
      leftovers can no longer make a sequence that is cut off by the end
      of the file look complete. */
   if (red < MAXUTFLEN)
      memset(bytesbuf + red, 0, MAXUTFLEN - red);

   bytesinbuf = red;

   return red;
}

/* Write one output line for the window currently held in bytesbuf.
 *
 * The line consists of:
 *   - posinfile as a zero-padded decimal offset,
 *   - the MAXUTFLEN window bytes in hex, with ".." for positions that the
 *     last read did not fill,
 *   - the value returned by utf2scalar() in hex when the first byte is
 *     ASCII or utf8valid() accepts the window, otherwise "--------",
 *   - the context printed by display_context().
 *
 * Afterwards posinfile is advanced past the bytes handled and fpi is
 * repositioned there, ready for the next getmore().
 *
 * fpi: input stream; fpo: stream receiving the line.
 */
static void display (FILE *fpi, FILE *fpo)
{
   size_t i;
   int valid;
   int lenhandled;

   fprintf(fpo, "%06ld 0x", posinfile);

   for (i = 0; i < MAXUTFLEN; i++)
   {
      /* Bytes are joined by '-'; the last one is followed by a space. */
      const char *sep = (i < MAXUTFLEN - 1) ? "-" : " ";

      if (i < bytesinbuf)
         fprintf(fpo, "%02x%s", bytesbuf[i], sep);
      else
         fprintf(fpo, "..%s", sep);
   }

   /* Validate once and reuse the verdict for both the scalar column and
      the advance length. */
   valid = utf8valid(bytesbuf) ? 1 : 0;

   /* bytesbuf[0] < 0x80 is the portable spelling of isascii(). */
   if (bytesbuf[0] < 0x80 || valid)
      fprintf(fpo, "0x%06lx: ", utf2scalar(bytesbuf));
   else
      /* Invalid UTF8 input */
      fprintf(fpo, "--------: ");

   /* Step over a whole character when the window is valid, otherwise
      over a single byte so that decoding can resynchronise. */
   lenhandled = valid ? utf8len(bytesbuf[0]) : 1;
   if (lenhandled < 1 || lenhandled > MAXUTFLEN)
      lenhandled = 1;
   /* Never step over bytes that were not actually read, in line with
      what display_context() does for its own window. */
   if (bytesinbuf > 0 && (size_t)lenhandled > bytesinbuf)
      lenhandled = (int)bytesinbuf;

   /* display_context() reads from the current stream position and
      expects that to be posinfile. */
   fseek(fpi, posinfile, SEEK_SET);
   display_context(fpi, fpo);

   posinfile += lenhandled;
   fseek(fpi, posinfile, SEEK_SET);
}

/* Print up to CONTEXTLEN characters of context, starting at posinfile,
 * followed by a newline.
 *
 * Each character is shown as its first byte when that byte is ASCII and
 * not a control character, and as '.' otherwise.  The function steps from
 * character to character using utf8valid()/utf8len(), falling back to a
 * single byte for input that does not validate.
 *
 * fpi must be positioned at posinfile on entry and is repositioned to
 * posinfile before returning.  All output goes to fpo.
 */
static void display_context (FILE *fpi, FILE *fpo)
{
   unsigned char contextbuf[MAXUTFLEN] = {0};
   long contextpos = posinfile;
   size_t red;
   int shown;

   for (shown = 0; shown < CONTEXTLEN; shown++)
   {
      int lenhandled;

      red = fread(contextbuf, 1, MAXUTFLEN, fpi);
      if (red == 0)
         break;

      /* After a short read the tail of contextbuf still holds bytes of
         the previous character; clear it so the validity check below
         only ever sees data that belongs to this window. */
      if (red < MAXUTFLEN)
         memset(contextbuf + red, 0, MAXUTFLEN - red);

      /* contextbuf[0] < 0x80 is the portable spelling of isascii(). */
      if (contextbuf[0] < 0x80 && !iscntrl(contextbuf[0]))
         fputc(contextbuf[0], fpo);
      else
         fputc('.', fpo);

      lenhandled = utf8valid(contextbuf) ? utf8len(contextbuf[0]) : 1;
      if (lenhandled < 1 || lenhandled > MAXUTFLEN)
         lenhandled = 1;
      /* Do not step past the bytes that were actually read. */
      if ((size_t)lenhandled > red)
         lenhandled = (int)red;
      contextpos += lenhandled;

      /* The read consumed a full window; back up to the start of the
         next character. */
      fseek(fpi, contextpos, SEEK_SET);
   }
   fputc('\n', fpo);

   fseek(fpi, posinfile, SEEK_SET);
}