/* Interlingua in Javanese script

   Pro un explication, vide ina-Java.htm. */
/* 6-15 April 2020. A converter from Latin script to Javanese
   script, intended for Interlingua, perhaps also usable
   for other languages.

   Author: Ruud Harmsen.
   Demo:   https://rudhar.com/lingtics/intrlnga/scrptura/ina-Java.htm

   Considerations, decisions, options: see the long comment
   below the actual C-code. And some in the code.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "utftools.h"

int convtab[][2] =
{
   /* Conversion table, indexed by (letter - 'a'). All values are
      offsets that get added to chrbase.

      References:
      https://rudhar.com/lingtics/uniclnkl.htm,
      https://unicode.org/charts/PDF/UA980.pdf
      https://en.wikipedia.org/wiki/Javanese_orthography
      https://en.wikipedia.org/wiki/Javanese_script
      https://r12a.github.io/scripts/javanese/
      http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3319.pdf

      https://lingojam.com/JavaneseScript (a converter,
      but it doesn't work well).
    */

   /* Vowels: { dependent sign, independent letter }. */
   ['a' - 'a'] = { 0xb4, 0x84 }, /* b4 as dependent code only if no pangkon (c0) is used */
   ['e' - 'a'] = { 0xbc, 0x8c }, /* ba = é (placed in front of the consonant), bc is a shwa and is above. */
   ['i' - 'a'] = { 0xb6, 0x86 },
   ['o' - 'a'] = { 0xb5, 0x8e }, /* Using b5 for o seems to be Sundanese, Javanese uses ba-b4. */
   ['u' - 'a'] = { 0xb8, 0x88 },

   /* Plain consonants: { letter, -1 } (no diacritic). */
   ['b' - 'a'] = { 0xa7, -1 },
   ['c' - 'a'] = { 0x95, -1 },
   ['d' - 'a'] = { 0xa2, -1 },
   ['g' - 'a'] = { 0x92, -1 },
   ['h' - 'a'] = { 0xb2, -1 },
   ['j' - 'a'] = { 0x97, -1 },
   ['k' - 'a'] = { 0x8f, -1 },
   ['l' - 'a'] = { 0xad, -1 },
   ['m' - 'a'] = { 0xa9, -1 },
   ['n' - 'a'] = { 0xa4, -1 },
   ['p' - 'a'] = { 0xa5, -1 },
   ['q' - 'a'] = { 0x90, -1 }, /* q = "k Sasak" */
   ['r' - 'a'] = { 0xab, -1 },
   ['s' - 'a'] = { 0xb1, -1 },
   ['t' - 'a'] = { 0xa0, -1 },
   ['w' - 'a'] = { 0xae, -1 },
   ['y' - 'a'] = { 0xaa, -1 },

   /* Consonants written with a diacritic: { letter, diacritic }.
      b3 is the three-dot sign (cecak telu). */
   ['f' - 'a'] = { 0xa5, 0xb3 }, /* Three dotted p */
   ['v' - 'a'] = { 0xae, 0xb3 }, /* Three dotted w */
   ['x' - 'a'] = { 0x8f, 0xb3 }, /* Three dotted k, per https://r12a.github.io/scripts/javanese/block. Or use ks. */
   ['z' - 'a'] = { 0x97, 0xb3 }, /* Three dotted j */
};

/* Classification of an input character. */
enum charclas_s
{
   CHARCLAS_UNKNOWN   = 0, /* Initial state, nothing seen yet */
   CHARCLAS_DIGIT     = 1,
   CHARCLAS_CONSONANT = 2,
   CHARCLAS_VOWEL     = 3,
   CHARCLAS_OTHER     = 4, /* Anything that is not a digit or letter */
};

/* Class of the character being handled (now) and of the one
   handled just before it (was). */
enum charclas_s charclas_now = CHARCLAS_UNKNOWN,
                charclas_was = CHARCLAS_UNKNOWN;

/* Bases that offsets and digit values are added to when
   building the output code points. */
static int chrbase = 0xa900;   /* letters, signs and punctuation */
static int numbase = 0xa9d0;   /* digits */

/* Options, all off by default; set from the command line. */
static int pangkon = 0;        /* --pangkon, -p */
static int taling  = 0;        /* --taling,  -t */
static int mixed   = 0;        /* --mixed,   -m */

/* Functions defined further down. */
static int convert (int c, FILE *fpi, FILE *fpo);
void padapangkat_pangkon (void);

/* Read HTML text from stdin, convert the text outside tags and
   character entities to Javanese script, and write it to stdout.

   Options:
      --pangkon, -p   write a pangkon after a consonant without vowel
      --taling,  -t   use taling / taling-tarung for e and o
      --mixed,   -m   the mixed style; it overrides the other two
                      options, wherever it appears on the command line

   Exits with status 1 on an unknown option, otherwise returns 0. */
int  main (int argc, char **argv)
{
   FILE *fpi = stdin, *fpo = stdout;
   const char *progname = argv[0];
   int c;
   int intag = 0, inentity = 0;
   int i;

   for (i = 1; i < argc; i++)
   {
      const char *arg = argv[i];

      if (strcmp(arg, "--pangkon") == 0 || strcmp(arg, "-p") == 0)
      {
         pangkon = 1;
      }
      else if (strcmp(arg, "--taling") == 0 || strcmp(arg, "-t") == 0)
      {
         taling = 1;
      }
      else if (strcmp(arg, "--mixed") == 0 || strcmp(arg, "-m") == 0)
      {
         mixed = 1;
      }
      else
      {
         fprintf(stderr, "%s: Option/argument %s invalid\n", progname, arg);
         exit(1);
      }
   }

   /* The mixed style excludes pangkon and taling. Resolve this after
      all options have been read, so that "-m -p" gives the same result
      as "-p -m". */
   if (mixed)
   {
      pangkon = 0;
      taling  = 0;
   }

   while ((c = getc(fpi)) != EOF)
   {
      /* Track whether we are inside <...> or &...; */
      if (!intag && c == '<')
         intag = 1;
      else if (intag && c == '>')
         intag = 0;
      else if (!inentity && c == '&')
         inentity = 1;
      else if (inentity && c == ';')
         inentity = 0;
      /* Known bug: style information in the HTML <head>, between
         <style> and </style>, is also converted, although it should
         not, being code. */

      if (intag || inentity)
      {
         /* Markup is copied unchanged. It counts as "other", so a
            pending pangkon or number terminator is written first. */
         charclas_now = CHARCLAS_OTHER;
         padapangkat_pangkon();
         putc(c, fpo);
         charclas_was = charclas_now;
      }
      else
      {
         /* The closing '>' and ';' also arrive here; convert()
            copies them, because they are not letters or digits. */
         convert(c, fpi, fpo);
      }
   }

   /* End of input: treat it like an "other" character, so that a
      final consonant or number is terminated properly. */
   charclas_now = CHARCLAS_OTHER;
   padapangkat_pangkon();

   return 0;
}

static int convert (int c, FILE *fpi, FILE *fpo)
{
   int index = 0, tabval0 = 0, tabval1 = 0;

   /* The Javanese script has no lowercase-uppercase distinction,
      except in complicated ways that I don't use. */
   c = tolower(c);

   if (mixed && (c == 'v' || c == 'w'))
   {
      /* Swap v and w, to minimise the number of three-dot
         diacritics (cecak telu). */
      c = (c == 'v' ? 'w' : 'v');
   }

   if (isdigit(c))
      charclas_now = CHARCLAS_DIGIT;
   else if (!isascii(c) || !isalpha(c))
      charclas_now = CHARCLAS_OTHER;
   else if (strchr("aeiou", c))
      charclas_now = CHARCLAS_VOWEL;
   else
      charclas_now = CHARCLAS_CONSONANT;

   padapangkat_pangkon();

   /* The Javanese script has no lowercase-uppercase distinction,
      except in a rather complicated way that I ignore (see
      https://en.wikipedia.org/wiki/Javanese_script#Pasangan,
      pasangan, murda, mahaprana */
   index = c - 'a';
   /* Safety first */
   if (index <= 0 || index > sizeof convtab / sizeof convtab[0])
      index = 0;

   tabval0 = convtab[index][0];
   tabval1 = convtab[index][1];

   switch (charclas_now)
   {
      case CHARCLAS_OTHER:
         putc(c, fpo);
         break;

      case CHARCLAS_DIGIT:
         if (charclas_was != CHARCLAS_DIGIT)
         {
            /* Write a "pada pangkat" to mark the start of a number (digit sequence) */
            fprintf(stdout, "%s", scalar2utf8(chrbase + 0xc7));
         }
         fprintf(stdout, "%s", scalar2utf8(numbase + (c - '0')));
         break;

      case CHARCLAS_VOWEL:
         if (charclas_was == CHARCLAS_CONSONANT)
         {
            /* Write dependent vowel value, except for implicit vowel a */
            if (c != 'a' || !pangkon)
            {
               if (c == 'e' && (taling || mixed))
               {
                  fprintf(stdout, "%s", scalar2utf8(chrbase + 0xba));
               }
               else if (c == 'o' && taling)
               {
                  /* Function scalar2utf8 uses a static buffer, so cannot
                     be safely called twice in an argument list */
                  fprintf(stdout, "%s", scalar2utf8(chrbase + 0xba));
                  fprintf(stdout, "%s", scalar2utf8(chrbase + 0xb4));
               }
               else
               {
                  fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0));
               }
            }
         }
         else
         {
            if (!mixed)
            {
               /* Write independent vowel value */
               fprintf(stdout, "%s", scalar2utf8(chrbase + tabval1));
            }
            else /* Option mixed is on */
            {
               if (charclas_was == CHARCLAS_VOWEL || c == 'a')
               {
                  fprintf(stdout, "%s", scalar2utf8(chrbase + convtab['h'-'a'][0]));
                  if (c != 'e')
                     fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0));
                  else
                     fprintf(stdout, "%s", scalar2utf8(chrbase + 0xba));
               }
               else
               {
                  /* Write independent vowel value */
                  fprintf(stdout, "%s", scalar2utf8(chrbase + tabval1));
               }
            }
         }
         break;

      case CHARCLAS_CONSONANT:
         if (mixed && c == 'h')
         {
            /* Write a three dotted h, to distinguish it from h
               used as a carrier for a dependent vowel sign. */
            fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0));
            fprintf(stdout, "%s", scalar2utf8(chrbase + 0xb3));
         }
         else
         {
            fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0));
            if (tabval1 >= 0)
               fprintf(stdout, "%s", scalar2utf8(chrbase + tabval1));
         }
         break;
   }

   charclas_was = charclas_now;

   return 0;
}

/* Write, to stdout, the sign that closes off the previous character,
   if one is needed. This depends on the class of the previous
   character (charclas_was) and of the current one (charclas_now):

   - a "pangkon" (virama) after a consonant that is not followed by a
     vowel, to mark the non-presence of a vowel; only with option
     pangkon;
   - a "pada pangkat" after the last digit of a number (digit
     sequence). */
void padapangkat_pangkon (void)
{
   int sign;

   switch (charclas_was)
   {
      case CHARCLAS_CONSONANT:
         if (!pangkon || charclas_now == CHARCLAS_VOWEL)
            return;
         sign = 0xc0;   /* pangkon */
         break;

      case CHARCLAS_DIGIT:
         if (charclas_now == CHARCLAS_DIGIT)
            return;
         sign = 0xc7;   /* pada pangkat */
         break;

      default:
         return;
   }

   fprintf(stdout, "%s", scalar2utf8(chrbase + sign));
}


/*============
   REFERENCE FILES

   See
   https://rudhar.com/lingtics/uniclnkl.htm,
   https://unicode.org/charts/PDF/UA980.pdf
   https://en.wikipedia.org/wiki/Javanese_orthography
   https://en.wikipedia.org/wiki/Javanese_script
   https://r12a.github.io/scripts/javanese/
   http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3319.pdf

   https://lingojam.com/JavaneseScript (a converter,
   but it doesn't work well).

   PURPOSE

   NB This converter is NOT suitable for real Javanese in Latin
   script, because no Latin digraphs like dh and th (for retroflex
   plosives), or ny and ng (for palatal and velar nasals) are
   recognised. The conversion is purely letter by letter. Also
   the signs è and é are not recognised. This converter is meant
   for Interlingua in pure ASCII only. (And perhaps English, Dutch,
   etc.)

   Also, the purpose is more to enjoy the beauty of the Javanese
   script (in some fonts; Linux Mint's system font, whatever that
   is, looks nice!), and to learn more about it, without knowing
   or learning the language itself. No serious use is intended
   or expected.

   PANGKON

   Using the pangkon (Sanskrit: virama; code A9C0), to suppress
   the implicit vowel 'short a' after each consonant, creates
   a very confused image in text converted from Interlingua in
   Latin script, because there are too many occurrences of pangkon
   in Interlingua. The pangkon often causes the next character
   (pasangan) to be _under_ the preceding consonant (wyanjana),
   which is beautiful if it happens occasionally, but not if it
   happens too often.

   In real Javanese (as said, not supported by this converter),
   there are fewer clusters consonant-consonant, there probably
   aren't any threefold sequences of consonants, like str in
   European languages. For final h, r and ng without a following
   vowel, special diacritics exist, so no pangkon is used for
   those in Javanese.

   To overcome this difficulty, pangkon is only used optionally
   (--pangkon or -p). The default is not to use it, interpreting
   the lack of a dependent vowel sign as indeed a missing vowel.
   That means the letter a after a consonant must now be explicitly
   written. For that I use code b4 for long a. This is similar to
   what I did earlier in ina-Deva.c . Of course, this defies the
   logic of Brahmic scripts, but results in more beautiful converted
   text. And that's why I built this converter in the first place:
   to enjoy the beauty of the Javanese script.

   Yet another reason why the use of pangkon causes problems, is
   that I use spaces between words. After a word that ends in a
   consonant, and before the space (or comma, stop, etc.) that
   follows, I have to insert a pangkon to indicate there is no vowel
   'a'. This pangkon will be displayed as a separate sign. In real
   Javanese, that happens only at the end of a sentence, because
   real Javanese in Javanese script uses no spaces between the words.
   (Javanese in Latin script does use them.)
   Quote from
   https://en.wikipedia.org/wiki/Javanese_script#Characteristics
   "Text is written from left to right and without word boundaries
   (Scriptio continua)."
   https://en.wikipedia.org/wiki/Scriptio_continua, more specifically
   https://en.wikipedia.org/wiki/Scriptio_continua#Javanese_script .
   Compare:
   https://en.wikipedia.org/wiki/Javanese_script#Font
   Is that the same, and the same in all fonts?

   VOWELS E AND O - DEPENDENT SYMBOLS

   Until April 8/9, 2020 -- and it will remain the default --
   for the vowel 'e' I used hexadecimal A9BC, Javanese vowel sign
   pepet. That is not really correct, as it seems to indicate a
   shwa-like vowel in Javanese, which in Interlingua should not be
   sounded (although it sometimes is).

   Also, for the vowel o, I use code A9B5, Javanese vowel sign tolong.
   However, that seems to be a Sundanese practice, says
   https://en.wikipedia.org/wiki/Javanese_script#Sundanese_language
   I am not sure if that is correct, because
   https://en.wikipedia.org/wiki/Sundanese_language
   does not mention the use of the Javanese script for the Sundanese
   language, and Sundanese has its own script, which not only looks
   quite different, but is also organised very differently as regards
   the code page: https://www.unicode.org/charts/PDF/U1B80.pdf

   Anyway, it seems in real Javanese the vowels e and o (two of each,
   but not distinguished in the script?) are indicated in a different
   way: 'e' is A9BA Javanese vowel sign taling (note that the code is
   after the consonant, but the sign appears before it!).

   The vowel 'o' is that same A9BA, but followed by an A9B4, vowel
   sign tarung. This otherwise means long a, for Sanskrit etc., but
   in Javanese, it does not occur as such. In my conversion (if without
   pangkon), I use it for every 'vowel a' after a consonant, though.

   Note that for example 'no' is encoded as nea = A9A4-A9BA-A9B4, but
   again, the font software renders it such that the signs of A9BA, then
   A9A4, then A9B4 appear, so more like ena = no.

   On 9 April 2020, I added the option --taling or -t to use that
   style, instead of the default e = pepet and o = tolong.

   By the way, could it be that these 'double' vowel diacritics for
   'o' have their roots in Devanagari code 094E? "Devanagari vowel sign
   prishthamatra E; character has historic use only; combines with E
   to form AI, with AA to form O, and with O to form AU".
   See https://unicode.org/charts/PDF/U0900.pdf .

   VOWELS - INDEPENDENT SYMBOLS

   Note also that native Javanese in Javanese script does not use the
   independent vowel signs (swara) for word-initial vowels. Instead,
   it uses the consonant h as a carrier, and adds the vowel diacritic
   (dependent vowel sign -- sandhangan swara) to that. This is possible
   because from texts I saw (like in the Javanese Wikipedia), it seems
   the letter h never occurs initially. The only h's there are, are
   either final (indicated by sesigeg wignyan = visarga), or part of the
   digraphs th and dh which denote retroflex plosives.

   In Interlingua however (and many other European languages) I cannot
   use h like that, because initial h AND word-initial vowels occur, and
   are abundant. Just take the frequent words 'a' and 'ha' as examples.
   So I do use independent vowel signs (swara) for initial vowels,
   (like in Sundanese, it seems) and also for anything else that isn't
   after a consonant. (So it's always after another vowel? I think so.)

   MAKING IT QUIETER?

   (12/13 April 2020)
   Possible optional enhancements (not (yet?) implemented):

   1.
   Swap v and w, so put the three dots (cecak telu) on the w, not the v,
   so w becomes a rekan. The letter w is quite infrequent in Interlingua,
   but v is frequent, so this creates a quieter and more uniform looking
   text. Cecak telu does look good, except in combination with other
   diacritics, such as wulu (i) and pepet (shwa, e)(if used).

   2.
   It would be possible to avoid using the independent vowel signs (which
   are also visually unquiet, and in the case of vowel a, not beautiful,
   to my taste), by doing the same as what is done in real Javanese: use
   h as a carrier consonant. Then a separate sign would be needed for when h
   in Interlingua actually occurs. That could be h with cecak telu, which
   is an existing sign for the Arabic "hard h" in loanwords (although in
   Interlingua the letter h does not sound like that).

   Measures 1. and 2. above would be contradictory, of course, in that
   one removes a frequent cecak telu, while the other introduces a new
   one. But h in Interlingua is not very frequent, much less so than in
   English, for example.

   13 April 2020:
   A compromise, or mixed solution, complicated but hopefully beautiful
   (option --mixed, -m):
   - Swap v and w, as above.
   - Interlingua h becomes h cecak telu.
   - Word-initial vowel: for 'a' use 'ha', for other vowels: use
     independent vowel sign.
   - Vowel that follows a vowel: always use h as a carrier, never use
     independent vowel signs.
   - After a consonant:
     - For vowel 'a' use tarung.
     - For vowel 'e' use taling, not pepet.
     - For vowel 'o' use tolong (A9B5), not taling-tarung.
 */