/* Pro un explication, vide ina-Java.htm. (For an explanation, see ina-Java.htm.) */
/* .
A converter from Latin script to Javanese
script, intended for Interlingua, perhaps also usable
for other languages.
Author: Ruud Harmsen.
Demo: https://rudhar.com/lingtics/intrlnga/scrptura/ina-Java.htm
Considerations, decisions, options: see the long comment
below the actual C-code. And some in the code.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "utftools.h"
/* Conversion table with one row per Latin letter; the row index is
   (letter - 'a'). Every value is an offset that convert() adds to
   chrbase before encoding the result as UTF-8.

   Vowel rows:     { dependent sign, written right after a consonant,
                     independent letter, written anywhere else }.
   Consonant rows: { base letter,
                     extra diacritic written after it, or -1 for none }.
   The only extra diacritic in this table is 0xb3, the three dots.

   References:
     https://rudhar.com/lingtics/uniclnkl.htm
     https://unicode.org/charts/PDF/UA980.pdf
     https://en.wikipedia.org/wiki/Javanese_orthography
     https://en.wikipedia.org/wiki/Javanese_script
     https://r12a.github.io/scripts/javanese/
     http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3319.pdf
     https://lingojam.com/JavaneseScript (a converter, but it doesn't
     work well). */
int convtab[][2] =
{
    /* 0xb4 is written as the dependent sign only if no pangkon (0xc0) is used. */
    ['a' - 'a'] = { 0xb4, 0x84 },
    ['b' - 'a'] = { 0xa7, -1 },
    ['c' - 'a'] = { 0x95, -1 },
    ['d' - 'a'] = { 0xa2, -1 },
    /* 0xbc is a shwa, drawn above the consonant. The alternative 0xba
       (an "é", drawn in front of the consonant) is not in the table. */
    ['e' - 'a'] = { 0xbc, 0x8c },
    /* p with three dots. */
    ['f' - 'a'] = { 0xa5, 0xb3 },
    ['g' - 'a'] = { 0x92, -1 },
    ['h' - 'a'] = { 0xb2, -1 },
    ['i' - 'a'] = { 0xb6, 0x86 },
    ['j' - 'a'] = { 0x97, -1 },
    ['k' - 'a'] = { 0x8f, -1 },
    ['l' - 'a'] = { 0xad, -1 },
    ['m' - 'a'] = { 0xa9, -1 },
    ['n' - 'a'] = { 0xa4, -1 },
    /* Using 0xb5 for o seems to be Sundanese; Javanese uses 0xba + 0xb4. */
    ['o' - 'a'] = { 0xb5, 0x8e },
    ['p' - 'a'] = { 0xa5, -1 },
    /* q is rendered as "k Sasak". */
    ['q' - 'a'] = { 0x90, -1 },
    ['r' - 'a'] = { 0xab, -1 },
    ['s' - 'a'] = { 0xb1, -1 },
    ['t' - 'a'] = { 0xa0, -1 },
    ['u' - 'a'] = { 0xb8, 0x88 },
    /* w with three dots. */
    ['v' - 'a'] = { 0xae, 0xb3 },
    ['w' - 'a'] = { 0xae, -1 },
    /* k with three dots, per https://r12a.github.io/scripts/javanese/block.
       Writing "ks" would be an alternative. */
    ['x' - 'a'] = { 0x8f, 0xb3 },
    ['y' - 'a'] = { 0xaa, -1 },
    /* j with three dots. */
    ['z' - 'a'] = { 0x97, 0xb3 },
};
/* Class of an input character. convert() classifies every character it
   receives; main() assigns CHARCLAS_OTHER to characters inside HTML tags
   and entities. The class of the current and of the previous character
   together decide which Javanese code is written. */
enum charclas_s
{
CHARCLAS_UNKNOWN, /* Initial value, before any character has been seen */
CHARCLAS_DIGIT, /* isdigit() is true */
CHARCLAS_CONSONANT, /* ASCII letter other than a, e, i, o, u */
CHARCLAS_VOWEL, /* One of a, e, i, o, u (after lowercasing) */
CHARCLAS_OTHER, /* Anything else; copied to the output unchanged */
};
/* Class of the character currently being processed. */
enum charclas_s charclas_now = CHARCLAS_UNKNOWN;
/* Class of the previously processed character. */
enum charclas_s charclas_was = CHARCLAS_UNKNOWN;
/* Converts one input character; defined below main(). */
static int convert (int c, FILE *fpi, FILE *fpo);
/* Base value to which the offsets from convtab, and the fixed sign
   offsets such as 0xc0 and 0xc7, are added. */
static int chrbase = 0xA900;
/* Code written for the digit '0'; the other digits follow consecutively. */
static int numbase = 0xA9D0;
/* Options, set from the command line in main():
   pangkon (--pangkon, -p): write chrbase + 0xc0 after a consonant that is
   not followed by a vowel, and write nothing for an 'a' that follows
   a consonant.
   taling (--taling, -t): after a consonant, write 0xba for 'e' and
   0xba followed by 0xb4 for 'o'.
   mixed (--mixed, -m): swap v and w, write h with three dots, and use h
   as a carrier for some vowels; at the point where it is parsed it also
   resets pangkon and taling to 0. */
static int pangkon = 0;
static int taling = 0;
static int mixed = 0;
/* Writes a pending pangkon or closing "pada pangkat", based on
   charclas_was and charclas_now; defined at the end of the code. */
void padapangkat_pangkon (void);
/* Update the HTML markup state for input character c.

   *intag is non-zero from a '<' up to (not including) the matching '>'.
   *inentity is non-zero from a '&' up to (not including) the character
   that ends the character reference: normally ';', but any character
   that cannot be part of a reference (anything other than a letter,
   a digit or '#') ends it as well, so that a bare '&' in running text
   does not switch conversion off until some later ';'.

   An '&' inside a tag (for example in a URL such as "?a=1&b=2") is
   ignored, because the whole tag is passed through unconverted anyway;
   treating it as the start of an entity would leave *inentity set after
   the tag has ended. */
static void track_markup (int c, int *intag, int *inentity)
{
    if (*intag)
    {
        if (c == '>')
            *intag = 0;
    }
    else if (c == '<')
    {
        *intag = 1;
        /* A tag start also ends an unterminated entity. */
        *inentity = 0;
    }
    else if (c == '&')
    {
        *inentity = 1;
    }
    else if (*inentity && c != '#' && !isalnum(c))
    {
        /* This includes the regular terminator ';'. */
        *inentity = 0;
    }
}

/* Filter: reads Latin-script text (optionally HTML) from stdin and writes
   it to stdout with letters and digits converted to Javanese script.

   Options:
     --pangkon, -p   sets pangkon
     --taling,  -t   sets taling
     --mixed,   -m   sets mixed and clears pangkon and taling
   Any other argument is reported on stderr and ends the program with
   exit status 1.

   Returns 0 when the input has been processed. */
int main (int argc, char **argv)
{
    FILE *fpi = stdin, *fpo = stdout;
    int c;
    int intag = 0, inentity = 0;
    const char *progname = argv[0];
    int argi;

    for (argi = 1; argi < argc; argi++)
    {
        const char *arg = argv[argi];

        if (strcmp(arg, "--pangkon") == 0 || strcmp(arg, "-p") == 0)
        {
            pangkon = 1;
        }
        else if (strcmp(arg, "--taling") == 0 || strcmp(arg, "-t") == 0)
        {
            taling = 1;
        }
        else if (strcmp(arg, "--mixed") == 0 || strcmp(arg, "-m") == 0)
        {
            /* NOTE(review): the reset below only affects options given
               before --mixed; a later -p or -t switches them on again.
               Confirm whether that order dependence is intended. */
            mixed = 1;
            pangkon = 0;
            taling = 0;
        }
        else
        {
            fprintf(stderr, "%s: Option/argument %s invalid\n", progname, arg);
            exit(1);
        }
    }

    while ((c = getc(fpi)) != EOF)
    {
        track_markup(c, &intag, &inentity);

        /* Known bug: style information in the HTML <head>, between
           <style> and </style>, is also converted, although it should
           not, being code. */
        if (intag || inentity)
        {
            /* Markup is copied unchanged, but it does end a consonant
               or digit sequence that precedes it. */
            charclas_now = CHARCLAS_OTHER;
            padapangkat_pangkon();
            putc(c, fpo);
            charclas_was = charclas_now;
        }
        else
        {
            convert(c, fpi, fpo);
        }
    }

    /* End of input also ends a pending consonant or digit sequence. */
    charclas_now = CHARCLAS_OTHER;
    padapangkat_pangkon();
    return 0;
}
/* Write the code (chrbase + offset) to stdout, UTF-8 encoded.
   Function scalar2utf8 uses a static buffer, so it cannot be safely
   called twice in one argument list; going through this helper keeps
   every call separate. */
static void put_javanese (int offset)
{
    fprintf(stdout, "%s", scalar2utf8(chrbase + offset));
}

/* Convert one input character and write the result.

   c    one character as returned by getc() (not EOF).
   fpi  unused.
   fpo  receives characters that are passed through unchanged. Converted
        characters are written to stdout, so callers should pass stdout
        here to keep the output in one stream.

   The character is lowercased and classified as digit, vowel, consonant
   or other; the class is stored in charclas_now and, at the end, copied
   to charclas_was. What is written depends on the class, on the class
   of the previous character, and on the options pangkon, taling and
   mixed.

   Always returns 0. */
static int convert (int c, FILE *fpi, FILE *fpo)
{
    /* Offsets of signs that are written directly, not via convtab. */
    enum
    {
        SIGN_CECAK_TELU   = 0xb3, /* three dots */
        SIGN_TARUNG       = 0xb4,
        SIGN_TALING       = 0xba,
        SIGN_PADA_PANGKAT = 0xc7
    };
    int index = 0, tabval0 = 0, tabval1 = 0;

    (void) fpi;

    /* The Javanese script has no lowercase-uppercase distinction,
       except in a rather complicated way that is ignored here (see
       https://en.wikipedia.org/wiki/Javanese_script#Pasangan:
       pasangan, murda, mahaprana). */
    c = tolower(c);

    if (mixed && (c == 'v' || c == 'w'))
    {
        /* Swap v and w, to minimise the number of three-dot
           diacritics (cecak telu). */
        c = (c == 'v' ? 'w' : 'v');
    }

    if (isdigit(c))
        charclas_now = CHARCLAS_DIGIT;
    else if (c < 0 || c > 0x7f || !isalpha(c))
        charclas_now = CHARCLAS_OTHER; /* includes all non-ASCII bytes */
    else if (strchr("aeiou", c))
        charclas_now = CHARCLAS_VOWEL;
    else
        charclas_now = CHARCLAS_CONSONANT;

    /* Close a preceding consonant or digit sequence, if necessary. */
    padapangkat_pangkon();

    /* Safety first: never index outside the table. The values read for
       a non-letter are not used. */
    index = c - 'a';
    if (index < 0 || index >= (int) (sizeof convtab / sizeof convtab[0]))
        index = 0;
    tabval0 = convtab[index][0];
    tabval1 = convtab[index][1];

    switch (charclas_now)
    {
    case CHARCLAS_OTHER:
        putc(c, fpo);
        break;

    case CHARCLAS_DIGIT:
        if (charclas_was != CHARCLAS_DIGIT)
        {
            /* Write a "pada pangkat" to mark the start of a number
               (digit sequence). */
            put_javanese(SIGN_PADA_PANGKAT);
        }
        fprintf(stdout, "%s", scalar2utf8(numbase + (c - '0')));
        break;

    case CHARCLAS_VOWEL:
        if (charclas_was == CHARCLAS_CONSONANT)
        {
            /* Write the dependent vowel sign, except for 'a' when
               pangkon is on: then 'a' is the implicit vowel. */
            if (c != 'a' || !pangkon)
            {
                if (c == 'e' && (taling || mixed))
                {
                    put_javanese(SIGN_TALING);
                }
                else if (c == 'o' && taling)
                {
                    put_javanese(SIGN_TALING);
                    put_javanese(SIGN_TARUNG);
                }
                else
                {
                    put_javanese(tabval0);
                }
            }
        }
        else if (mixed && (charclas_was == CHARCLAS_VOWEL || c == 'a'))
        {
            /* Option mixed: use h as a carrier consonant, followed by
               the dependent vowel sign (taling for 'e'). */
            put_javanese(convtab['h' - 'a'][0]);
            if (c != 'e')
                put_javanese(tabval0);
            else
                put_javanese(SIGN_TALING);
        }
        else
        {
            /* Write the independent vowel letter. */
            put_javanese(tabval1);
        }
        break;

    case CHARCLAS_CONSONANT:
        put_javanese(tabval0);
        if (mixed && c == 'h')
        {
            /* Write a three dotted h, to distinguish it from h
               used as a carrier for a dependent vowel sign. */
            put_javanese(SIGN_CECAK_TELU);
        }
        else if (tabval1 >= 0)
        {
            put_javanese(tabval1);
        }
        break;

    default:
        /* CHARCLAS_UNKNOWN is never assigned above. */
        break;
    }

    charclas_was = charclas_now;
    return 0;
}
/* Close whatever the previous character left open, judging by
   charclas_was and charclas_now:
   - with option pangkon, a consonant that is not followed by a vowel
     gets a "pangkon" (virama, offset 0xc0), marking the absence of
     a vowel after it;
   - otherwise, a digit that is not followed by another digit gets
     a "pada pangkat" (offset 0xc7), marking the end of the number.
   In all other situations nothing is written. Output goes to stdout. */
void padapangkat_pangkon (void)
{
    int mark = -1; /* offset of the sign to write; -1 means none */

    if (pangkon && charclas_was == CHARCLAS_CONSONANT && charclas_now != CHARCLAS_VOWEL)
        mark = 0xc0;
    else if (charclas_was == CHARCLAS_DIGIT && charclas_now != CHARCLAS_DIGIT)
        mark = 0xc7;

    if (mark < 0)
        return;

    fprintf(stdout, "%s", scalar2utf8(chrbase + mark));
}
/*============
REFERENCE FILES
See
https://rudhar.com/lingtics/uniclnkl.htm,
https://unicode.org/charts/PDF/UA980.pdf
https://en.wikipedia.org/wiki/Javanese_orthography
https://en.wikipedia.org/wiki/Javanese_script
https://r12a.github.io/scripts/javanese/
http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3319.pdf
https://lingojam.com/JavaneseScript (a converter,
but it doesn't work well).
PURPOSE
NB This converter is NOT suitable for real Javanese in Latin
script, because no Latin digraphs like dh and th (for retroflex
plosives), or ny and ng (for palatal and velar nasals) are
recognised. The conversion is purely letter by letter. Also
the signs è and é are not recognised. This converter is meant
for Interlingua in pure ASCII only. (And perhaps English, Dutch,
etc.)
Also, the purpose is more to enjoy the beauty of the Javanese
script (in some fonts; Linux Mint's system font, whatever that
is, looks nice!), and to learn more about it, without knowing
or learning the language itself. No serious use is intended
or expected.
PANGKON
Using the pangkon (Sanskrit: virama; code A9C0), to suppress
the implicit vowel 'short a' after each consonant, creates
a very confused image in text converted from Interlingua in
Latin script, because there are too many occurrences of pangkon
in Interlingua. The pangkon often causes the next character
(pasangan) to be _under_ the preceding consonant (wyanjana),
which is beautiful if it happens occasionally, but not if it
happens too often.
In real Javanese (as said, not supported by this converter),
there are fewer clusters consonant-consonant, there probably
aren't any threefold sequences of consonants, like str in
European languages. For final h, r and ng without a following
vowel, special diacritics exist, so no pangkon is used for
those in Javanese.
To overcome this difficulty, pangkon is only used optionally
(--pangkon or -p). The default is not to use it, interpreting
the lack of a dependent vowel sign as indeed a missing vowel.
That means the letter a after a consonant must now be explicitly
written. For that I use code b4 for long a. This is similar to
what I did earlier in ina-Deva.c . Of course, this defies the
logic of Brahmic scripts, but results in more beautiful converted
text. And that's why I built this converter in the first place:
to enjoy the beauty of the Javanese script.
Yet another reason why the use of pangkon causes problems, is
that I use spaces between words. After a word that ends in a
consonant, and before the space (or comma, stop, etc.) that
follows, I have to insert a pangkon to indicate there is no vowel
'a'. This pangkon will be displayed as a separate sign. In real
Javanese, that happens only at the end of a sentence, because
real Javanese in Javanese script uses no spaces between the words.
(Javanese in Latin script does use them.)
Quote from
https://en.wikipedia.org/wiki/Javanese_script#Characteristics
"Text is written from left to right and without word boundaries
(Scriptio continua)."
https://en.wikipedia.org/wiki/Scriptio_continua, more specifically
https://en.wikipedia.org/wiki/Scriptio_continua#Javanese_script .
Compare:
https://en.wikipedia.org/wiki/Javanese_script#Font
Is that the same, and the same in all fonts?
VOWELS E AND O - DEPENDENT SYMBOLS
Until April 8/9, 2020 -- and it will remain the default --
for the vowel 'e' I used hexadecimal A9BC, Javanese vowel sign
pepet. That is not really correct, as it seems to indicate a
shwa-like vowel in Javanese, which in Interlingua should not be
sounded (although it sometimes is).
Also, for the vowel o, I use code A9B5, Javanese vowel sign tolong.
However, that seems to be a Sundanese practice, says
https://en.wikipedia.org/wiki/Javanese_script#Sundanese_language
I am not sure if that is correct, because
https://en.wikipedia.org/wiki/Sundanese_language
does not mention the use of the Javanese script for the Sundanese
language, and Sundanese has its own script, which not only looks
quite different, but is also organised very differently as regards
the code page: https://www.unicode.org/charts/PDF/U1B80.pdf
Anyway, it seems in real Javanese the vowels e and o (two of each,
but not distinguished in the script?) are indicated in a different
way: 'e' is A9BA Javanese vowel sign taling (note that the code is
after the consonant, but the sign appears before it!).
The vowel 'o' is that same A9BA, but followed by an A9B4, vowel
sign tarung. This otherwise means long a, for Sanskrit etc., but
in Javanese, it does not occur as such. In my conversion (if without
pangkon), I use it for every 'vowel a' after a consonant, though.
Note that for example 'no' is encoded as nea = A9A4-A9BA-A9B4, but
again, the font software renders it such that the signs of A9BA, then
A9A4, then A9B4 appear, so more like ena = no.
On 9 April 2020, I added the option --taling or -t to use that
style, instead of the default e = pepet and o = tolong.
By the way, could it be that these 'double' vowel diacritics for
'o' have their roots in Devanagari code 094E? "Devanagari vowel sign
prishthamatra E; character has historic use only; combines with E
to form AI, with AA to form O, and with O to form AU".
See https://unicode.org/charts/PDF/U0900.pdf .
VOWELS - INDEPENDENT SYMBOLS
Note also that native Javanese in Javanese script does not use the
independent vowel signs (swara) for word-initial vowels. Instead,
it uses the consonant h as a carrier, and adds the vowel diacritic
(dependent vowel sign -- sandhangan swara) to that. This is possible
because from texts I saw (like in the Javanese Wikipedia), it seems
the letter h never occurs initially. The only h's there are, are
either final (indicated by sesigeg wignyan = visarga), or part of the
digraphs th and dh which denote retroflex plosives.
In Interlingua however (and many other European languages) I cannot
use h like that, because initial h AND word-initial vowels occur, and
are abundant. Just take the frequent words 'a' and 'ha' as examples.
So I do use independent vowel signs (swara) for initial vowels,
(like in Sundanese, it seems) and also for anything else that isn't
after a consonant. (So it's always after another vowel? I think so.)
MAKING IT QUIETER?
(12/13 April 2020)
Possible optional enhancements (not (yet?) implemented):
1.
Swap v and w, so put the three dots (cecak telu) on the w, not the v,
so w becomes a rekan. The letter w is quite infrequent in Interlingua,
but v is frequent, so this creates a quieter and more uniform looking
text. Cecak telu does look good, except in combination with other
diacritics, such as wulu (i) and pepet (shwa, e)(if used).
2.
It would be possible to avoid using the independent vowel signs (which
are also visually unquiet, and in the case of vowel a, not beautiful,
to my taste), by doing the same as what is done in real Javanese: use
h as a carrier consonant. Then a separate sign would be needed for when h
in Interlingua actually occurs. That could be h with cecak telu, which
is an existing sign for the Arabic "hard h" in loanwords (although in
Interlingua the letter h does not sound like that).
Measures 1. and 2. above would be contradictory, of course, in that
one removes a frequent cecak telu, while the other introduces a new
one. But h in Interlingua is not very frequent, much less so than in
English, for example.
13 April 2020:
A compromise, or mixed solution, complicated but hopefully beautiful
(option --mixed, -m):
- Swap v and w, as above.
- Interlingua h becomes h cecak telu.
- Word-initial vowel: for 'a' use 'ha', for other vowels: use
independent vowel sign.
- Vowel that follows a vowel: always use h as a carrier, never use
independent vowel signs.
- After a consonant:
- For vowel 'a' use tarung.
- For vowel 'e' use taling, not pepet.
- For vowel 'o' use tolong (A9B5), not taling-tarung.
*/