/* Interlingua in Georgian script */

/* Le 23 usque 26 de april 2020. Converter un texto in interlingua
   del scriptura latin al scriptura georgian.

   Autor: Ruud Harmsen, 
   	  https://rudhar.com/lingtics/intrlnga/scrptura/
   Demo:  https://rudhar.com/religion/soantaka.htm
  */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "utftools.h"

/* Transliteration table, indexed by the position of a Latin letter in
   the alphabet (0 for a ... 25 for z). Each entry is the code point of
   the Georgian letter that replaces it; the values lie in the 10D0
   block. One extra entry of -1 follows z as a "no mapping" marker.

   See:
   https://en.wikipedia.org/wiki/Georgian_language#Phonology
   https://en.wikipedia.org/wiki/Georgian_scripts
   http://unicode.org/charts/PDF/U10A0.pdf

   "Note 1" and "Note 2" refer to the design considerations in the
   comment further down in this file. */
long conv_table[] =
{
   0x10D0,   /* a */
   0x10D1,   /* b */
   0x10EC,   /* c -- Note 1 */
   0x10D3,   /* d */
   0x10D4,   /* e */
   0x10E4,   /* f -- Note 2 */
   0x10D2,   /* g */
   0x10F0,   /* h */
   0x10D8,   /* i */
   0x10DF,   /* j -- based on the sound of j in French Jean */
   0x10D9,   /* k -- Note 1 */
   0x10DA,   /* l */
   0x10DB,   /* m */
   0x10DC,   /* n */
   0x10DD,   /* o */
   0x10DE,   /* p -- Note 1 */
   0x10E7,   /* q */
   0x10E0,   /* r */
   0x10E1,   /* s */
   0x10E2,   /* t -- Note 1 */
   0x10E3,   /* u */
   0x10D5,   /* v */
   0x10F3,   /* w -- Wikipedia:   U+10F3 vie [...] Svan /w/,
                     Unicode.org: 10F3 ჳ GEORGIAN LETTER WE
                     For Georgian itself, this is an archaic letter. */
   0x10EE,   /* x -- based on the sound as in German Bach, not
                     its unusual sound ks in Interlingua */
   0x10F2,   /* y -- Wikipedia: ჲ (hie), also called yota,[59]
                     appeared instead [...] */
   0x10D6,   /* z */
   -1        /* safety stop: index 26 means "no mapping" */
};

/* Forward declarations of the file-local helpers used by main(). */

/* Write character c to fpo, transliterated if it is an ASCII letter;
   the two offsets are added to the table value for lowercase and
   uppercase input letters respectively. */
static int convert (int c, FILE *fpi, FILE *fpo,
   long offset_lowercase, long offset_uppercase);
/* Derive the lowercase/uppercase offsets from the command-line
   options and store them through the two pointers. */
static void interpret_cmdline_options (int argc, char **argv,
    long *p_offset_lowercase, long *p_offset_uppercase);

/* Filter standard input to standard output. Text is passed through
   convert() character by character; HTML markup is copied verbatim:

   - everything from '<' up to and including the next '>' (a tag);
   - everything from '&' up to and including the next ';' (a
     character reference such as &amp; or &#4304;).

   An '&' inside a tag (e.g. in a URL in an attribute) does not start
   a character reference, and a would-be reference is abandoned as
   soon as a character shows up that cannot be part of one (anything
   other than a letter, a digit, '#' or ';'). Without these two rules
   a stray '&' would switch off the conversion of all following text
   up to the next ';'.

   Returns 0 on success, EXIT_FAILURE if reading or writing failed. */
int  main (int argc, char **argv)
{
   long offset_lowercase = 0;
   long offset_uppercase = 0;
   FILE *fpi = stdin, *fpo = stdout;
   int c;
   int intag = 0, inentity = 0;

   interpret_cmdline_options(argc, argv, &offset_lowercase, &offset_uppercase);

   while ((c = getc(fpi)) != EOF)
   {
      if (intag)
      {
         /* Inside a tag: copy as is, the closing '>' included. */
         putc(c, fpo);
         if (c == '>')
            intag = 0;
         continue;
      }

      if (inentity)
      {
         if (c == ';' || c == '#' || isalnum(c))
         {
            /* Still a plausible character reference: copy as is. */
            putc(c, fpo);
            if (c == ';')
               inentity = 0;
            continue;
         }

         /* Not a character reference after all (a bare '&' in the
            text). Leave entity mode and handle c as ordinary input. */
         inentity = 0;
      }

      if (c == '<')
      {
         intag = 1;
         putc(c, fpo);
      }
      else if (c == '&')
      {
         inentity = 1;
         putc(c, fpo);
      }
      else
      {
         convert(c, fpi, fpo, offset_lowercase, offset_uppercase);
      }
   }

   /* Report I/O trouble through the exit status instead of silently
      producing truncated output. */
   if (ferror(fpi) || fflush(fpo) == EOF || ferror(fpo))
   {
      fputs("error while reading input or writing output\n", stderr);
      return EXIT_FAILURE;
   }

   return 0;
}

/* Write one input character to fpo, transliterated where applicable.

   c                 the character to handle, as returned by getc()
   fpi               input stream; not used here, kept so the
                     signature stays as declared
   fpo               output stream
   offset_lowercase  added to the table value when c is a lowercase
                     Latin letter
   offset_uppercase  added to the table value when c is an uppercase
                     Latin letter

   Anything that is not an ASCII letter is copied unchanged. For a
   letter, the code point from conv_table plus the offset is turned
   into text by scalar2utf8() and written with "%s".
   NOTE(review): scalar2utf8() comes from utftools.h, which is not
   visible here; presumably it returns a NUL-terminated UTF-8 string.
   Confirm who owns the returned buffer.

   Always returns 0. */
static int convert (int c, FILE *fpi, FILE *fpo,
   long offset_lowercase, long offset_uppercase)
{
   long offset;
   long tabval;
   int index;

   (void) fpi;

   /* Plain range test instead of isascii(), which is not part of
      ISO C. It also keeps out-of-range values away from isalpha(). */
   if (c < 0 || c > 0x7F || !isalpha(c))
   {
      putc(c, fpo);
      return 0;
   }

   if (isupper(c))
   {
      index = c - 'A';
      offset = offset_uppercase;
   }
   else
   {
      index = c - 'a';
      offset = offset_lowercase;
   }

   /* Safety first: anything outside a..z is directed to the -1
      entry that follows z in the table. */
   if (index < 0 || index >= 26)
      index = 26;

   tabval = conv_table[index];
   if (tabval < 0)
   {
      /* No mapping: leave the character as it is. */
      putc(c, fpo);
   }
   else
   {
      fprintf(fpo, "%s", scalar2utf8(tabval + offset));
   }

   return 0;
}

/* Work out, from the command-line options, which offsets must be
   added to the table values (which are in the 10D0 block) for
   lowercase and for uppercase Latin letters.

   argc, argv           as passed to main(); leading arguments that
                        start with '-' are examined, parsing stops at
                        the first argument that does not
   p_offset_lowercase   receives the offset for lowercase letters
   p_offset_uppercase   receives the offset for uppercase letters

   Recognised options:
      --asomtavruli, -a     --nuskhuri, -n
      --mtavruli,    -m     --hybrid,   -h
   Short options may be bundled (-an, -na, ...). Unknown short option
   letters and unknown long options are ignored. A long option is only
   ever matched as a whole word, so e.g. "--help" is not mistaken for
   a bundle containing -h.

   If none of the options is given, the two offsets are left
   untouched, so the caller has to initialise them. When several
   options are combined, the order of precedence is: -a together with
   -n, then -a, then -n, then -m, then -h. */
static void interpret_cmdline_options (int argc, char **argv,
    long *p_offset_lowercase, long *p_offset_uppercase)
{
   /* Unicode starting points of the four alphabets. */
   const long mkhedruli_start   = 0x10D0;
   const long asomtavruli_start = 0x10A0;
   const long nuskhuri_start    = 0x2D00;
   const long mtavruli_start    = 0x1C90;

   int asomtavruli = 0;
   int nuskhuri    = 0;
   int mtavruli    = 0;
   int hybrid      = 0;

   while (argc > 1 && argv[1][0] == '-')
   {
      const char *arg = argv[1];

      if (arg[1] == '-')
      {
         /* Long option: must match completely. */
         if (!strcmp(arg, "--asomtavruli"))
            asomtavruli = 1;
         else if (!strcmp(arg, "--nuskhuri"))
            nuskhuri = 1;
         else if (!strcmp(arg, "--mtavruli"))
            mtavruli = 1;
         else if (!strcmp(arg, "--hybrid"))
            hybrid = 1;
         /* else: unknown long option, ignored */
      }
      else
      {
         /* One or more short options behind a single dash. */
         const char *cp;

         for (cp = arg + 1; *cp; cp++)
         {
            switch (*cp)
            {
               case 'a': asomtavruli = 1; break;
               case 'n': nuskhuri    = 1; break;
               case 'm': mtavruli    = 1; break;
               case 'h': hybrid      = 1; break;
               default:  /* unknown letter, ignored */ break;
            }
         }
      }

      argc--, argv++;
   }

   if (asomtavruli && nuskhuri)
   {
      *p_offset_uppercase = asomtavruli_start - mkhedruli_start;
      *p_offset_lowercase = nuskhuri_start    - mkhedruli_start;
   }
   else if (asomtavruli)
   {
      *p_offset_uppercase = asomtavruli_start - mkhedruli_start;
      *p_offset_lowercase = asomtavruli_start - mkhedruli_start;
   }
   else if (nuskhuri)
   {
      *p_offset_uppercase = nuskhuri_start - mkhedruli_start;
      *p_offset_lowercase = nuskhuri_start - mkhedruli_start;
   }
   else if (mtavruli)
   {
      *p_offset_uppercase = mtavruli_start - mkhedruli_start;
      *p_offset_lowercase = 0;
   }
   else if (hybrid)
   {
      *p_offset_uppercase = asomtavruli_start - mkhedruli_start;
      *p_offset_lowercase = 0;
   }
}


/* =========================================================
   Design considerations:

   See:
   https://en.wikipedia.org/wiki/Georgian_language#Phonology
   https://en.wikipedia.org/wiki/Georgian_scripts
   http://unicode.org/charts/PDF/U10A0.pdf

   There are also the Unicode pages
   http://unicode.org/charts/PDF/U10A0.pdf and
   http://unicode.org/charts/PDF/U2D00.pdf
   which include the characters used in a religious context.
   By default I don't use any of those; they are only produced
   when one of the options -a, -n or -h is given (see Note 3).

   Note 1:
   For p, t and k of Interlingua, it's possible to use the
   aspirated letters/sounds of the Georgian language, or
   the glottalised/ejective ones. Wikipedia says that
   "The glottalization of the ejectives is rather light, ..."
   and also considering that Georgian has no /f/, the letter
   for aspirated p would be a candidate for f in the Latin
   script normally used for Interlingua.

   Considering that, I decided to use the ejective letters
   p, t, k, and also ejective ts for the letter c, even if
   of course Interlingua has no ejective sounds, and c can
   sound as ts and k. But I ignore phonetics there, and
   transliterate just alphabetically.

   Note 2:
   For f, I could have used 10F6 ჶ GEORGIAN LETTER FI, but it
   doesn't have a corresponding (ecclesiastic) uppercase form.
   That would be code 10C6, but nothing was defined for that.
   Therefore I use the codes for the aspirated p as Interlingua's
   f: 10E4.

   Note 3:
   The Georgian script in fact consists of four different kinds:

   - Unicode starting point: 10D0
     Mkhedruli, the normal script for everyday use. It rarely
     uses uppercase. In this program, this is the default, so
     'A' and 'a' are mapped to the same Georgian letter. This
     means the conversion is non-reversible, of course.

   - Unicode starting point: 10A0
     The oldest alphabet, for ecclesiastical use, uppercase
     only. One style is called Asomtavruli.

   - Unicode starting point: 2D00
     Nuskhuri, a lowercase alphabet, for ecclesiastical use.

   - Unicode starting point: 1C90
     Mtavruli, an uppercase secular alphabet. Used for emphasis
     and names, similar to small caps in Latin script. In the past
     sometimes also used in the same way as in the Latin, Greek
     and Cyrillic scripts, i.e. at the start of a sentence, and
     the start of a name.
     At least on my system (Linux Mint 18.3), this uppercase
     alphabet is not supported by system fonts.

   Options in this program ina-Georg:
   - no option, default value: Mkhedruli only.

   - --asomtavruli or -a: Asomtavruli only.

   - --nuskhuri or -n: Nuskhuri only.

   - Those last two options combined (may also be written as
     -n -a, -na, -an): Latin uppercase is mapped to Asomtavruli,
     and lowercase to Nuskhuri

   - --mtavruli or -m: Latin uppercase as Mtavruli, lowercase
     as Mkhedruli.

   - --hybrid or -h: Latin uppercase as Asomtavruli, lowercase
     as Mkhedruli. This is never seen in the wild, but it is
     nice to see how it looks, also because option -m causes
     characters not supported by system fonts (at least on my
     system as of 26 April 2020).
 */