utf8mixd.c

Publication, written 15 December 2008 or before. Updated 28 September 2019.

Sources belong to this explanation.

Source code



/* Copyright 2008 by R. Harmsen.
   But I won't sue anyone who uses or adapts the code.

   28 September 2019: Added option -u to make the output all UTF8.

   20 February 2020: Fixed a bug which caused the characters ÿ and þ
   to be incorrectly converted from ISO-8859-1/CP1252 to UTF-8. That
   is, they were not converted at all, just copied as is.
   See also an added function in utftools.h/utftools.c .
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../cgi-src/utftools/utftools.h"

/* Write the program description and the two accepted command-line
   forms to stderr. Takes no arguments and returns nothing. */
static void Usage (void)
{
   static const char text[] =
      "This is a simple, specialised UTF8 to Latin1 converter.\n"
      "Unlike general purpose converters like uniconv, it can\n"
      "reliably convert textfiles in languages like German, French\n"
      "Spanish etc. in which part of the text is in ISO-8859-1\n"
      "(Latin1) and part in UTF8. It does this by detecting what\n"
      "is probably UTF8 and what is certainly not."
      "\n"
      "Optionally (-w), it also support Windows code page\n"
      "1252, instead of plain ISO8859-1.\n\n"
      "Optionally (-u), the output is UTF8, not ISO8859-1 or Windows 1252.\n"
      "\n"
      "Usage:\tutf8mixd [-w] infile outfile\n"
      "or:\tutf8mixd [-u] infile outfile\n\n";

   fputs(text, stderr);
}

/* Output encoding selected on the command line.
   The numeric values are 1, 2 and 3, in this order. */
enum dowhats
{
   wantISO8859_1 = 1,   /* default: no option given */
   wantCP1252,          /* option -w */
   wantUTF8             /* option -u */
};

/* Forward declarations of the two per-byte converters called from main().
   Both receive the byte c that was just read from fi and return non-zero
   when they have written output to fo themselves. */
static int Convert (int c, FILE *fi, FILE *fo, enum dowhats dowhat);
static int Convert2UTF8 (int c, FILE *fi, FILE *fo);

/* Entry point: parse an optional -w or -u flag, open the input and output
   files in binary mode, and copy the input to the output one byte at a
   time. Each byte is first offered to Convert() (ISO-8859-1 / CP1252
   output) or Convert2UTF8() (UTF-8 output); when the converter returns 0
   the byte is copied unchanged.

   Exit status:
     0  success
     1  wrong number of arguments
     2  input file cannot be opened
     3  output file cannot be opened
     4  a read or write error was detected while copying or closing */
int main (int argc, char **argv)
{
   FILE *fi, *fo;
   int c, converted;
   int status = 0;
   enum dowhats dowhat;

   dowhat = wantISO8859_1;

   /* An option is only looked for when there is an extra argument. */
   if (argc > 3)
   {
      if (argv[1][0] == '-' && argv[1][1] == 'w')
      {
         dowhat = wantCP1252;
         argc--, argv++;
      }
      else if (argv[1][0] == '-' && argv[1][1] == 'u')
      {
         dowhat = wantUTF8;
         argc--, argv++;
      }
   }

   if (argc != 3)
   {
      Usage();
      exit(1);
   }
   fi = fopen(argv[1], "rb");
   if (!fi)
   {
      fprintf(stderr, "Cannot open input file %s\n\n", argv[1] ? argv[1] : "null");
      Usage();
      exit(2);
   }
   fo = fopen(argv[2], "wb");
   if (!fo)
   {
      fprintf(stderr, "Cannot open output file %s\n\n", argv[2] ? argv[2] : "null");
      Usage();
      exit(3);
   }

   while ((c = getc(fi)) != EOF)
   {
      if (dowhat == wantUTF8)
         converted = Convert2UTF8(c, fi, fo);
      else
         converted = Convert(c, fi, fo, dowhat);

      if (!converted)
         putc(c, fo);
   }

   /* getc() returns EOF for both end-of-file and read errors, so the
      error indicators have to be inspected explicitly. */
   if (ferror(fi))
   {
      fprintf(stderr, "Error reading input file %s\n", argv[1]);
      status = 4;
   }
   if (ferror(fo))
   {
      fprintf(stderr, "Error writing output file %s\n", argv[2]);
      status = 4;
   }

   fclose(fi);

   /* fclose() flushes buffered output; a failure here means the output
      file is incomplete. */
   if (fclose(fo) != 0 && status == 0)
   {
      fprintf(stderr, "Error writing output file %s\n", argv[2]);
      status = 4;
   }

   return status;
}

/* Try to replace a UTF-8 sequence that starts with byte c by a single
   byte in the requested 8-bit encoding.

   c       the byte most recently read from fi by the caller
   fi      input stream; must be seekable, because the bytes read ahead
           are un-read with fseek() when no conversion takes place
   fo      output stream
   dowhat  wantISO8859_1 or wantCP1252

   Returns 1 if one converted byte was written to fo (the whole sequence
   is then consumed from fi). Returns 0 if nothing was written; fi is
   then positioned just after c, so that the caller can copy c itself. */
static int Convert (int c, FILE *fi, FILE *fo, enum dowhats dowhat)
{
   int j, cc, len;
   int toput;
   unsigned char buf[10];
   long pos, scalar;

   /* Presumably the sequence length announced by the lead byte c;
      utf8len() is declared in utftools.h. */
   len = utf8len(c);

   /* Nothing to read ahead, so nothing to convert and nothing to rewind.
      Returning here keeps ftell()/fseek() off the path taken by every
      plain single-byte character. */
   if (len <= 1)
      return 0;

   pos = ftell(fi);

   memset(buf, '\0', sizeof buf);
   buf[0] = c;

   /* Collect the rest of the candidate sequence; buf stays zero-padded
      when the input ends early. */
   for (j = 1; j < len && j < (int)(sizeof buf - 1); j++)
   {
      cc = getc(fi);
      if (cc == EOF)
         break;
      buf[j] = cc;
   }

   if (utf8valid(buf))
   {
      scalar = utf2scalar(buf);

      if (dowhat == wantISO8859_1 && isconvertibleISO8859_1(scalar))
      {
         putc(scalar, fo);
         return 1;
      }

      if (dowhat == wantCP1252 && isconvertibleCP1252(scalar, &toput))
      {
         /* Code points that ISO-8859-1 can hold are written as is;
            for the others the byte supplied in toput is used. */
         if (isconvertibleISO8859_1(scalar))
            putc(scalar, fo);
         else
            putc(toput, fo);

         return 1;
      }
   }

   /* Not converted: un-read the look-ahead bytes. */
   fseek(fi, pos, SEEK_SET);

   return 0;
}

/* Write the character that starts with byte c to fo as UTF-8.

   c   the byte most recently read from fi by the caller
   fi  input stream; must be seekable, because the bytes read ahead are
       un-read with fseek() when they turn out not to form valid UTF-8
   fo  output stream

   Cases, in this order:
     - utf8frst(c) == 1: c is copied unchanged;
     - c plus the following bytes pass utf8valid(): the sequence is
       copied unchanged;
     - isconvertibleISO8859_1(c): c is written as a 2-byte UTF-8 sequence;
     - otherwise c is mapped through CP1252_to_scalar() and, when that
       yields a positive value, written as UTF-8.

   Returns 1 if something was written to fo, 0 if not. In the last three
   cases only c itself has been consumed from fi on return, unless a
   valid UTF-8 sequence was copied. */
static int Convert2UTF8 (int c, FILE *fi, FILE *fo)
{
   int j, cc, len;
   unsigned char buf[10];
   long pos, scalar;
   char *b;

   len = utf8frst(c);

   /* Single-byte character: no look-ahead, so no ftell() needed. */
   if (len == 1)
   {
      putc(c, fo);
      return 1;
   }

   if (len > 1)
   {
      pos = ftell(fi);

      memset(buf, '\0', sizeof buf);
      buf[0] = c;

      /* Collect the rest of the candidate sequence; buf stays
         zero-padded when the input ends early. */
      for (j = 1; j < len && j < (int)(sizeof buf - 1); j++)
      {
         cc = getc(fi);
         if (cc == EOF)
            break;
         buf[j] = cc;
      }

      /* The length test keeps fwrite() from reading past buf should
         utf8frst() ever report a length that does not fit. */
      if (len < (int)sizeof buf && utf8valid(buf))
      {
         fwrite(buf, len, 1, fo);
         return 1;
      }

      /* Not UTF-8 after all: un-read the look-ahead bytes, so that
         only c is consumed. */
      fseek(fi, pos, SEEK_SET);
   }

   if (isconvertibleISO8859_1(c))
   {
      fwrite(scalar2utf8(c), 2, 1, fo);
      return 1;
   }

   /* Could only be CP1252 then */
   scalar = CP1252_to_scalar(c);
   if (scalar > 0)
   {
      b = (char *)scalar2utf8(scalar);
      fwrite(b, strlen(b), 1, fo);
      return 1;
   }

   return 0;
}