/* Pro un explication, vide ina-Java.htm. (For an explanation, see ina-Java.htm.) */
/* . A converter from Latin script to Javanese script, intended for Interlingua, perhaps also usable for other languages. Author: Ruud Harmsen. Demo: https://rudhar.com/lingtics/intrlnga/scrptura/ina-Java.htm Considerations, decisions, options: see the long comment below the actual C-code. And some in the code. */ #include <stdlib.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include "utftools.h" int convtab[][2] = { /* See https://rudhar.com/lingtics/uniclnkl.htm, https://unicode.org/charts/PDF/UA980.pdf https://en.wikipedia.org/wiki/Javanese_orthography https://en.wikipedia.org/wiki/Javanese_script https://r12a.github.io/scripts/javanese/ http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3319.pdf https://lingojam.com/JavaneseScript (a converter, but it doesn't work well). */ /* For vowels: dependent first, independent next. For consonants, sometimes: diacritic last*/ /* a */ { 0xb4, 0x84}, /* b4 as dependent code only if no pangkon (c0) is used */ /* b */ { 0xa7, -1 }, /* c */ { 0x95, -1 }, /* d */ { 0xa2, -1 }, /* e */ { 0xbc, 0x8c}, /* ba = é (placed in front of the consonant), bc is a shwa and is above. */ /* f */ { 0xa5, 0xb3}, /* Three dotted p */ /* g */ { 0x92, -1 }, /* h */ { 0xb2, -1 }, /* i */ { 0xb6, 0x86}, /* j */ { 0x97, -1 }, /* k */ { 0x8f, -1 }, /* l */ { 0xad, -1 }, /* m */ { 0xa9, -1 }, /* n */ { 0xa4, -1 }, /* o */ { 0xb5, 0x8e}, /* Using b5 for o seems to be Sundanese, Javanese uses ba-b4. */ /* p */ { 0xa5, -1 }, /* q */ { 0x90, -1 }, /* q = "k Sasak" */ /* r */ { 0xab, -1 }, /* s */ { 0xb1, -1 }, /* t */ { 0xa0, -1 }, /* u */ { 0xb8, 0x88}, /* v */ { 0xae, 0xb3}, /* Three dotted w */ /* w */ { 0xae, -1 }, /* x */ { 0x8f, 0xb3}, /* Three dotted k, per https://r12a.github.io/scripts/javanese/block. Or use ks. 
*/ /* y */ { 0xaa, -1 }, /* z */ { 0x97, 0xb3}, /* Three dotted j */ }; enum charclas_s { CHARCLAS_UNKNOWN, CHARCLAS_DIGIT, CHARCLAS_CONSONANT, CHARCLAS_VOWEL, CHARCLAS_OTHER, }; enum charclas_s charclas_now = CHARCLAS_UNKNOWN; enum charclas_s charclas_was = CHARCLAS_UNKNOWN; static int convert (int c, FILE *fpi, FILE *fpo); static int chrbase = 0xA900; static int numbase = 0xA9D0; /* Options */ static int pangkon = 0; static int taling = 0; static int mixed = 0; void padapangkat_pangkon (void); int main (int argc, char **argv) { FILE *fpi = stdin, *fpo = stdout; int c; int intag = 0, inentity = 0; char *progname; for (progname = argv[0]; argc > 1; argc--, argv++) { if (strcmp(argv[1], "--pangkon") == 0 || strcmp(argv[1], "-p" ) == 0) { pangkon = 1; } else if (strcmp(argv[1], "--taling") == 0 || strcmp(argv[1], "-t" ) == 0) { taling = 1; } else if (strcmp(argv[1], "--mixed") == 0 || strcmp(argv[1], "-m" ) == 0) { mixed = 1; pangkon = 0; taling = 0; } else { fprintf(stderr, "%s: Option/argument %s invalid\n", progname, argv[1]); exit(1); } } while ((c = getc(fpi)) != EOF) { if (!intag && c == '<') intag = 1; else if (intag && c == '>') intag = 0; else if (!inentity && c == '&') inentity = 1; else if (inentity && c == ';') inentity = 0; /* Known bug: style information in the HTML <head>, between <style> and </style>, is also converted, although it should not, being code. */ if (intag || inentity) { charclas_now = CHARCLAS_OTHER; padapangkat_pangkon(); putc(c, fpo); charclas_was = charclas_now; } else { convert(c, fpi, fpo); } } /* End-of-file handling, if necessary */ if (c == EOF) charclas_now = CHARCLAS_OTHER; padapangkat_pangkon(); return 0; } static int convert (int c, FILE *fpi, FILE *fpo) { int index = 0, tabval0 = 0, tabval1 = 0; /* The Javanese script has no lowercase-uppercase distinction, except in complicated ways that I don't use. 
*/ c = tolower(c); if (mixed && (c == 'v' || c == 'w')) { /* Swap v and w, to minimise the number of three-dot diacritics (cecak telu). */ c = (c == 'v' ? 'w' : 'v'); } if (isdigit(c)) charclas_now = CHARCLAS_DIGIT; else if (!isascii(c) || !isalpha(c)) charclas_now = CHARCLAS_OTHER; else if (strchr("aeiou", c)) charclas_now = CHARCLAS_VOWEL; else charclas_now = CHARCLAS_CONSONANT; padapangkat_pangkon(); /* The Javanese script has no lowercase-uppercase distinction, except in a rather complicated way that I ignore (see https://en.wikipedia.org/wiki/Javanese_script#Pasangan, pasangan, murda, mahaprana */ index = c - 'a'; /* Safety first */ if (index <= 0 || index > sizeof convtab / sizeof convtab[0]) index = 0; tabval0 = convtab[index][0]; tabval1 = convtab[index][1]; switch (charclas_now) { case CHARCLAS_OTHER: putc(c, fpo); break; case CHARCLAS_DIGIT: if (charclas_was != CHARCLAS_DIGIT) { /* Write a "pada pangkat" to mark the start of a number (digit sequence) */ fprintf(stdout, "%s", scalar2utf8(chrbase + 0xc7)); } fprintf(stdout, "%s", scalar2utf8(numbase + (c - '0'))); break; case CHARCLAS_VOWEL: if (charclas_was == CHARCLAS_CONSONANT) { /* Write dependent vowel value, except for implicit vowel a */ if (c != 'a' || !pangkon) { if (c == 'e' && (taling || mixed)) { fprintf(stdout, "%s", scalar2utf8(chrbase + 0xba)); } else if (c == 'o' && taling) { /* Function scalar2utf8 uses a static buffer, so cannot be safely called twice in an argument list */ fprintf(stdout, "%s", scalar2utf8(chrbase + 0xba)); fprintf(stdout, "%s", scalar2utf8(chrbase + 0xb4)); } else { fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0)); } } } else { if (!mixed) { /* Write independent vowel value */ fprintf(stdout, "%s", scalar2utf8(chrbase + tabval1)); } else /* Option mixed is on */ { if (charclas_was == CHARCLAS_VOWEL || c == 'a') { fprintf(stdout, "%s", scalar2utf8(chrbase + convtab['h'-'a'][0])); if (c != 'e') fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0)); else 
fprintf(stdout, "%s", scalar2utf8(chrbase + 0xba)); } else { /* Write independent vowel value */ fprintf(stdout, "%s", scalar2utf8(chrbase + tabval1)); } } } break; case CHARCLAS_CONSONANT: if (mixed && c == 'h') { /* Write a three dotted h, to distinguish it from h used as a carrier for a dependent vowel sign. */ fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0)); fprintf(stdout, "%s", scalar2utf8(chrbase + 0xb3)); } else { fprintf(stdout, "%s", scalar2utf8(chrbase + tabval0)); if (tabval1 >= 0) fprintf(stdout, "%s", scalar2utf8(chrbase + tabval1)); } break; } charclas_was = charclas_now; return 0; } void padapangkat_pangkon (void) { if (pangkon && charclas_was == CHARCLAS_CONSONANT && charclas_now != CHARCLAS_VOWEL) { /* Write a "pangkon" (virama) to mark the non-presence of a vowel after the consonant */ fprintf(stdout, "%s", scalar2utf8(chrbase + 0xc0)); } else if (charclas_was == CHARCLAS_DIGIT && charclas_now != CHARCLAS_DIGIT) { /* Write a "pada pangkat" to mark the end of a number (digit sequence) */ fprintf(stdout, "%s", scalar2utf8(chrbase + 0xc7)); } } /*============ REFERENCE FILES See https://rudhar.com/lingtics/uniclnkl.htm, https://unicode.org/charts/PDF/UA980.pdf https://en.wikipedia.org/wiki/Javanese_orthography https://en.wikipedia.org/wiki/Javanese_script https://r12a.github.io/scripts/javanese/ http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3319.pdf https://lingojam.com/JavaneseScript (a converter, but it doesn't work well). PURPOSE NB This converter is NOT suitable for real Javanese in Latin script, because no Latin digraphs like dh and th (for retroflex plosives), or ny and ng (for palatal and velar nasals) are recognised. The conversion is purely letter by letter. Also the signs è and é are not recognised. This converter is meant for Interlingua in pure ASCII only. (And perhaps English, Dutch, etc.) 
Also, the purpose is more to enjoy the beauty of the Javanese script (in some fonts; Linux Mint's system font, whatever that is, looks nice!), and to learn more about it, without knowing or learning the language itself. Serious use is neither intended nor expected. PANGKON Using the pangkon (Sanskrit: virama; code A9C0), to suppress the implicit vowel 'short a' after each consonant, creates a very confused image in text converted from Interlingua in Latin script, because there are too many occurrences of pangkon in Interlingua. The pangkon often causes the next character (pasangan) to be _under_ the preceding consonant (wyanjana), which is beautiful if it happens occasionally, but not if it happens too often. In real Javanese (as said, not supported by this converter), there are fewer clusters consonant-consonant, there probably aren't any threefold sequences of consonants, like str in European languages. For final h, r and ng without a following vowel, special diacritics exist, so no pangkon is used for those in Javanese. To overcome this difficulty, pangkon is only used optionally (--pangkon or -p). The default is not to use it, interpreting the lack of a dependent vowel sign as indeed a missing vowel. That means the letter a after a consonant must now be explicitly written. For that I use code b4 for long a. This is similar to what I did earlier in ina-Deva.c . Of course, this defies the logic of Brahmic scripts, but results in more beautiful converted text. And that's why I built this converter in the first place: to enjoy the beauty of the Javanese script. Yet another reason why the use of pangkon causes problems, is that I use spaces between words. After a word that ends in a consonant, and before the space (or comma, stop, etc.) that follows, I have to insert a pangkon to indicate there is no vowel 'a'. This pangkon will be displayed as a separate sign. 
In real Javanese, that happens only at the end of a sentence, because real Javanese in Javanese script uses no spaces between the words. (Javanese in Latin script does use them.) Quote from https://en.wikipedia.org/wiki/Javanese_script#Characteristics "Text is written from left to right and without word boundaries (Scriptio continua)." https://en.wikipedia.org/wiki/Scriptio_continua, more specifically https://en.wikipedia.org/wiki/Scriptio_continua#Javanese_script . Compare: https://en.wikipedia.org/wiki/Javanese_script#Font Is that the same, and the same in all fonts? VOWELS E AND O - DEPENDENT SYMBOLS Until April 8/9, 2020 -- and it will remain the default -- for the vowel 'e' I used hexadecimal A9BC, Javanese vowel sign pepet. That is not really correct, as it seems to indicate a shwa-like vowel in Javanese, which in Interlingua should not be sounded (although it sometimes is). Also, for the vowel o, I use code A9B5, Javanese vowel sign tolong. However, that seems to be a Sundanese practice, says https://en.wikipedia.org/wiki/Javanese_script#Sundanese_language I am not sure if that is correct, because https://en.wikipedia.org/wiki/Sundanese_language does not mention the use of the Javanese script for the Sundanese language, and Sundanese has its own script, which not only looks quite different, but is also organised very differently as regards the code page: https://www.unicode.org/charts/PDF/U1B80.pdf Anyway, it seems in real Javanese the vowels e and o (two of each, but not distinguished in the script?) are indicated in a different way: 'e' is A9BA Javanese vowel sign taling (note that the code is after the consonant, but the sign appears before it!). The vowel 'o' is that same A9BA, but followed by an A9B4, vowel sign tarung. This otherwise means long a, for Sanskrit etc., but in Javanese, it does not occur as such. In my conversion (if without pangkon), I use it for every 'vowel a' after a consonant, though. 
Note that for example 'no' is encoded as nea = A9A4-A9BA-A9B4, but again, the font software renders it such that the signs of A9BA, then A9A4, then A9B4 appear, so more like ena = no. On 9 April 2020, I added the option --taling or -t to use that style, instead of the default e = pepet and o = tolong. By the way, could it be that these 'double' vowel diacritics for 'o' have their roots in Devanagari code 094E? "Devanagari vowel sign prishthamatra E; character has historic use only; combines with E to form AI, with AA to form O, and with O to form AU". See https://unicode.org/charts/PDF/U0900.pdf . VOWELS - INDEPENDENT SYMBOLS Note also that native Javanese in Javanese script does not use the independent vowel signs (swara) for word-initial vowels. Instead, it uses the consonant h as a carrier, and adds the vowel diacritic (dependent vowel sign -- sandhangan swara) to that. This is possible because from texts I saw (like in the Javanese Wikipedia), it seems the letter h never occurs initially. The only h's there are, are either final (indicated by sesigeg wignyan = visarga), or part of the digraphs th and dh which denote retroflex plosives. In Interlingua however (and many other European languages) I cannot use h like that, because initial h AND word-initial vowels occur, and are abundant. Just take the frequent words 'a' and 'ha' as examples. So I do use independent vowel signs (swara) for initial vowels (like in Sundanese, it seems), and also for anything else that isn't after a consonant. (So it's always after another vowel? I think so.) MAKING IT QUIETER? (12/13 April 2020) Possible optional enhancements (not (yet?) implemented): 1. Swap v and w, so put the three dots (cecak telu) on the w, not the v, so w becomes a rekan. The letter w is quite infrequent in Interlingua, but v is frequent, so this creates a quieter and more uniform looking text. Cecak telu does look good, except in combination with other diacritics, such as wulu (i) and pepet (shwa, e) (if used). 
2. It would be possible to avoid using the independent vowel signs (which are also visually unquiet, and in the case of vowel a, not beautiful, to my taste), by doing the same as what is done in real Javanese: use h as a carrier consonant. Then a separate sign would be needed for when h in Interlingua actually occurs. That could be h with cecak telu, which is an existing sign for the Arabic "hard h" in loanwords (although in Interlingua the letter h does not sound like that). Measures 1. and 2. above would be contradictory, of course, in that one removes a frequent cecak telu, while the other introduces a new one. But h in Interlingua is not very frequent, much less so than in English, for example. 13 April 2020: A compromise, or mixed solution, complicated but hopefully beautiful (option --mixed, -m): - Swap v and w, as above. - Interlingua h becomes h cecak telu. - Word-initial vowel: for 'a' use 'ha', for other vowels: use independent vowel sign. - Vowel that follows a vowel: always use h as a carrier, never use independent vowel signs. - After a consonant: - For vowel 'a' use tarung. - For vowel 'e' use taling, not pepet. - For vowel 'o' use tolong (A9B5), not taling-tarung. */