/* Le 23 usque . Converter un texto in interlingua
del scriptura latin al scriptura georgian.
Autor: Ruud Harmsen,
https://rudhar.com/lingtics/intrlnga/scrptura/
Demo: https://rudhar.com/religion/soantaka.htm
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "utftools.h"
/* Transliteration table, indexed by the position of the Latin letter in
the alphabet (0 for a/A up to 25 for z/Z). Every entry is a code point
in the Mkhedruli range that starts at U+10D0; convert() adds a per-case
offset to it in order to reach the other Georgian scripts.
The extra entry at index 26 is a negative sentinel: convert() clamps
too-large indexes to it and then copies the input character unchanged. */
long conv_table[] =
{
/* See:
https://en.wikipedia.org/wiki/Georgian_language#Phonology
https://en.wikipedia.org/wiki/Georgian_scripts
http://unicode.org/charts/PDF/U10A0.pdf */
/* a */ 0x10D0,
/* b */ 0x10D1,
/* c */ 0x10EC, /* Ejective ts; Note 1 in the design considerations
at the end of this file */
/* d */ 0x10D3,
/* e */ 0x10D4,
/* f */ 0x10E4, /* Aspirated p; Note 2 in the design considerations
at the end of this file */
/* g */ 0x10D2,
/* h */ 0x10F0,
/* i */ 0x10D8,
/* j */ 0x10DF, /* Based on sound of j in French Jean */
/* k */ 0x10D9, /* Ejective k; Note 1 */
/* l */ 0x10DA,
/* m */ 0x10DB,
/* n */ 0x10DC,
/* o */ 0x10DD,
/* p */ 0x10DE, /* Ejective p; Note 1 */
/* q */ 0x10E7,
/* r */ 0x10E0,
/* s */ 0x10E1,
/* t */ 0x10E2, /* Ejective t; Note 1 */
/* u */ 0x10E3,
/* v */ 0x10D5,
/* w */ 0x10F3, /* Wikipedia: U+10F3 vie [...] Svan /w/,
Unicode.org: 10F3 ჳ GEORGIAN LETTER WE
For Georgian itself, this is an archaic letter. */
/* x */ 0x10EE, /* Based on sound as in German Bach, not
its unusual sound ks in Interlingua */
/* y */ 0x10F2, /* Wikipedia: ჲ (hie), also called yota,[59] appeared instead [...] */
/* z */ 0x10D6,
/* safety stop */ -1, /* Must remain the last entry (index 26) */
};
/* Forward declarations of the file-local helpers that are defined
after main():
- convert() writes the transliteration of one input character;
- interpret_cmdline_options() derives the two code point offsets
(for lowercase and for uppercase Latin letters) from the
command line. */
static int convert (int c, FILE *fpi, FILE *fpo,
long offset_lowercase, long offset_uppercase);
static void interpret_cmdline_options (int argc, char **argv,
long *p_offset_lowercase, long *p_offset_uppercase);
/*
 * Filter standard input to standard output, transliterating ASCII
 * letters to Georgian script while leaving markup alone:
 *
 * - everything from '<' up to and including the next '>' (a tag) is
 *   copied verbatim;
 * - a character reference, i.e. '&' followed by letters, digits or
 *   '#' and closed by ';', is copied verbatim;
 * - every other character is handed to convert().
 *
 * An '&' inside a tag (for instance in a URL attribute) does not
 * start a character reference, and a reference that is not closed by
 * ';' ends at the first character that cannot be part of one.
 * Without these two rules a stray '&' would switch transliteration
 * off until some later, unrelated ';'.
 *
 * Returns 0 on success, EXIT_FAILURE when reading or writing failed.
 */
int main (int argc, char **argv)
{
    long offset_lowercase = 0;
    long offset_uppercase = 0;
    FILE *fpi = stdin, *fpo = stdout;
    int c;
    int intag = 0, inentity = 0;

    interpret_cmdline_options(argc, argv, &offset_lowercase, &offset_uppercase);

    while ((c = getc(fpi)) != EOF)
    {
        if (intag)
        {
            /* Inside a tag only '>' is significant. */
            if (c == '>')
                intag = 0;
            putc(c, fpo);
            continue;
        }

        if (c == '<')
        {
            /* A tag also terminates any unfinished character reference. */
            intag = 1;
            inentity = 0;
            putc(c, fpo);
            continue;
        }

        if (inentity)
        {
            if (c == ';' || c == '#' || (c < 0x80 && isalnum(c)))
            {
                if (c == ';')
                    inentity = 0;
                putc(c, fpo);
                continue;
            }
            /* Not a valid continuation: the '&' was a bare ampersand.
               Drop the state and treat c as ordinary input below. */
            inentity = 0;
        }

        if (c == '&')
        {
            inentity = 1;
            putc(c, fpo);
            continue;
        }

        convert(c, fpi, fpo, offset_lowercase, offset_uppercase);
    }

    /* Report truncated input or output instead of exiting silently. */
    if (ferror(fpi) || fflush(fpo) == EOF || ferror(fpo))
        return EXIT_FAILURE;
    return 0;
}
/*
 * Write the transliteration of one input character to fpo.
 *
 * c                 character as delivered by getc()
 * fpi               input stream; not used here, kept for the
 *                   existing interface
 * fpo               output stream
 * offset_lowercase  value added to the table code point for a-z
 * offset_uppercase  value added to the table code point for A-Z
 *
 * Characters outside the ASCII letters (including every byte of a
 * multi-byte UTF-8 sequence) are copied unchanged. Letters are looked
 * up in conv_table by their position in the alphabet; the selected
 * offset is added and the result is written as UTF-8 by means of
 * scalar2utf8().
 *
 * Always returns 0.
 */
static int convert (int c, FILE *fpi, FILE *fpo,
                    long offset_lowercase, long offset_uppercase)
{
    long offset;
    long tabval;
    int index;

    (void) fpi;

    /* Explicit range test instead of the POSIX-only isascii(); it also
       keeps negative or too-large values away from the <ctype.h>
       functions. */
    if (c < 0 || c > 0x7F || !isalpha(c))
    {
        putc(c, fpo);
        return 0;
    }

    if (isupper(c))
    {
        index = c - 'A';
        offset = offset_uppercase;
    }
    else
    {
        index = c - 'a';
        offset = offset_lowercase;
    }

    /* Safety first: anything that is not a valid letter position is
       redirected to the negative sentinel that ends the table. */
    if (index < 0 || index >= 26)
        index = 26;

    tabval = conv_table[index];
    if (tabval < 0)
        putc(c, fpo);
    else
        fprintf(fpo, "%s", scalar2utf8(tabval + offset));

    return 0;
}
/*
 * Derive the code point offsets for lowercase and uppercase Latin
 * letters from the command line.
 *
 * argc, argv          as received by main(); parsing stops at the
 *                     first argument that does not start with '-'
 * p_offset_lowercase  receives the offset for a-z
 * p_offset_uppercase  receives the offset for A-Z
 *
 * Recognised options:
 *   --asomtavruli, -a   --nuskhuri, -n   --mtavruli, -m   --hybrid, -h
 * Short options may be bundled, e.g. -na. Unknown short flags and
 * unknown long options are ignored. A long option is only accepted
 * when it matches exactly: it is never read as a bundle of short
 * flags, so a misspelling such as --mtavruly cannot silently select
 * a different script.
 *
 * Both outputs are left untouched when no script option is given, so
 * the caller's defaults stay in force. When several options are
 * given, the order of precedence is: -a together with -n, then -a,
 * -n, -m, -h.
 */
static void interpret_cmdline_options (int argc, char **argv,
                                       long *p_offset_lowercase, long *p_offset_uppercase)
{
    /* First code point of each script; the conversion table holds
       Mkhedruli code points, so offsets are relative to that one. */
    enum
    {
        BASE_MKHEDRULI   = 0x10D0,
        BASE_ASOMTAVRULI = 0x10A0,
        BASE_NUSKHURI    = 0x2D00,
        BASE_MTAVRULI    = 0x1C90
    };
    int asomtavruli = 0;
    int nuskhuri = 0;
    int mtavruli = 0;
    int hybrid = 0;
    int i;

    for (i = 1; i < argc && argv[i][0] == '-'; i++)
    {
        const char *arg = argv[i];

        if (arg[1] == '-')
        {
            /* Long option: exact match or nothing. */
            if (!strcmp(arg, "--asomtavruli"))
                asomtavruli = 1;
            else if (!strcmp(arg, "--nuskhuri"))
                nuskhuri = 1;
            else if (!strcmp(arg, "--mtavruli"))
                mtavruli = 1;
            else if (!strcmp(arg, "--hybrid"))
                hybrid = 1;
        }
        else
        {
            /* One or more bundled short flags. */
            const char *cp;

            for (cp = arg + 1; *cp; cp++)
            {
                switch (*cp)
                {
                case 'a': asomtavruli = 1; break;
                case 'n': nuskhuri = 1; break;
                case 'm': mtavruli = 1; break;
                case 'h': hybrid = 1; break;
                default: break;
                }
            }
        }
    }

    if (asomtavruli && nuskhuri)
    {
        *p_offset_uppercase = BASE_ASOMTAVRULI - BASE_MKHEDRULI;
        *p_offset_lowercase = BASE_NUSKHURI - BASE_MKHEDRULI;
    }
    else if (asomtavruli)
    {
        *p_offset_uppercase = BASE_ASOMTAVRULI - BASE_MKHEDRULI;
        *p_offset_lowercase = BASE_ASOMTAVRULI - BASE_MKHEDRULI;
    }
    else if (nuskhuri)
    {
        *p_offset_uppercase = BASE_NUSKHURI - BASE_MKHEDRULI;
        *p_offset_lowercase = BASE_NUSKHURI - BASE_MKHEDRULI;
    }
    else if (mtavruli)
    {
        *p_offset_uppercase = BASE_MTAVRULI - BASE_MKHEDRULI;
        *p_offset_lowercase = 0;
    }
    else if (hybrid)
    {
        *p_offset_uppercase = BASE_ASOMTAVRULI - BASE_MKHEDRULI;
        *p_offset_lowercase = 0;
    }
}
/* =========================================================
Design considerations:
See:
https://en.wikipedia.org/wiki/Georgian_language#Phonology
https://en.wikipedia.org/wiki/Georgian_scripts
http://unicode.org/charts/PDF/U10A0.pdf
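The Unicode pages
http://unicode.org/charts/PDF/U10A0.pdf and
http://unicode.org/charts/PDF/U2D00.pdf
also contain characters used in a religious context
(Asomtavruli and Nuskhuri). By default I don't use any of
those; they are only produced when requested with the
options listed at the end of these notes.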
Note 1:
For p, t and k of Interlingua, it's possible to use the
aspirated letters/sounds of the Georgian language, or
the glottalised/ejective ones. Wikipedia says that
"The glottalization of the ejectives is rather light, ..."
and also considering that Georgian has no /f/, the letter
for aspirated p would be a candidate for f in the Latin
script normally used for Interlingua.
Considering that, I decided to use the ejective letters
p, t, k, and also ejective ts for the letter c, even if
of course Interlingua has no ejective sounds, and c can
sound as ts and k. But I ignore phonetics there, and
transliterate just alphabetically.
Note 2:
For f, I could have used 10F6 ჶ GEORGIAN LETTER FI, but it
doesn't have a corresponding (ecclesiastic) uppercase form.
That would be code 10C6, but nothing was defined for that.
Therefore I use the codes for the aspirated p as Interlingua's
f: 10E4.
Note 3:
The Georgian script in fact consists of four different kinds:
- Unicode starting point: 10D0
Mkhedruli, the normal script for everyday use. It rarely
uses uppercase. In this program, this is the default, so
'A' and 'a' are mapped to the same Georgian letter. This
means the conversion is non-reversible, of course.
- Unicode starting point: 10A0
The oldest alphabet, for ecclesiastical use, uppercase
only. One style is called Asomtavruli.
- Unicode starting point: 2D00
Nuskhuri, a lowercase alphabet, for ecclesiastical use.
- Unicode starting point: 1C90
Mtavruli, an uppercase secular alphabet. Used for emphasis
and names, similar to small caps in Latin script. In the
past it was sometimes also used in the same way as uppercase
in the Latin, Greek and Cyrillic scripts, i.e. at the start
of a sentence and at the start of a name.
At least on my system (Linux Mint 18.3), this uppercase
alphabet is not supported by system fonts.
Options in this program ina-Georg:
- no option, default value: Mkhedruli only.
- --asomtavruli or -a: Asomtavruli only.
- --nuskhuri or -n: Nuskhuri only.
- Those last two options combined (may also be written as
-n -a, -na, -an): Latin uppercase is mapped to Asomtavruli,
and lowercase to Nuskhuri
- --mtavruli or -m: Latin uppercase as Mtavruli, lowercase
as Mkhedruli.
- --hybrid or -h: Latin uppercase as Asomtavruli, lowercase
as Mkhedruli. This is never seen in the wild, but it is
nice to see how it looks, also because option -m causes
characters not supported by system fonts (at least on my
system as of 26 April 2020).
*/