Módulo:generar-pron/ru
Apariencia
La documentación para este módulo puede ser creada en Módulo:generar-pron/ru/doc
-- Tomado de en.wikt, implementado por Tmagc
local remove_grave_accents_from_phonetic_respelling = true -- Anatoli's desired value
local m_ru_translit = require("Módulo:translit/ru")
local export = {}
local insert = table.insert
local concat = table.concat
local remove = table.remove
local m_str = require("Módulo:String")
local u = m_str.char
local strfind = m_str.find
local strmatchit = m_str.gmatch
local strsubn = m_str.gsub
local strsubrep = m_str.gsub_rep
local strsplit = m_str.split
local strstrip = m_str.strip
local strlower = m_str.lower
local substr = m_str.sub
local strlen = m_str.len
local strnfd = m_str.toNFD
local strnfc = m_str.toNFC
local strhtml = m_str.encode_html
-- Single-result wrapper around strsubn(): performs the same substitution but
-- hands back only the first return value (the resulting string), discarding
-- any further return values.
local function strsub(term, foo, bar)
	return (strsubn(term, foo, bar))
end
-- Turn a list into a set: returns a fresh table in which every element
-- visited by ipairs(t) is a key mapped to true.
local function list_to_set(t)
	local members = {}
	for _, value in ipairs(t) do
		members[value] = true
	end
	return members
end
-- Pattern character class matching one basic punctuation mark: brackets,
-- Spanish/regular exclamation and question marks, period, comma, semicolon,
-- colon, en dash and em dash.
local PUNTUACION = "[%(%)%[%]%{%}¡!¿?.,;:–—]"
-- Same class as PUNTUACION, extended with straight and curly quotes,
-- guillemets, the apostrophe and the spacing acute. Some characters are listed
-- twice inside the class, which is harmless.
local PUNTUACION_EXTRA = "[%(%)%[%]%{%}¡!¿?.,;:–—\"“”„‟‘’«»»«‹››‹'´]"
-- Section marked "ru.common" in the original (presumably the definitions come
-- from en.wikt's ru-common module -- TODO confirm).
-- Combining diacritics, built from their Unicode code points.
local AC = u(0x0301) -- acute = ́
local GR = u(0x0300) -- grave = ̀
local CFLEX = u(0x0302) -- circumflex = ̂
local BREVE = u(0x0306) -- breve ̆
local DIA = u(0x0308) -- diaeresis = ̈
local CARON = u(0x030C) -- caron ̌
local OGONEK = u(0x0328) -- ogonek ̨
local DUBGR = u(0x030F) -- double grave = ̏
local DOTABOVE = u(0x0307) -- dot above = ̇
local DOTBELOW = u(0x0323) -- dot below = ̣
-- Placeholder characters (code points U+FFF1..U+FFF4) used as internal markers.
local PSEUDOVOWEL = u(0xFFF1) -- pseudovowel placeholder
local PSEUDOCONS = u(0xFFF2) -- pseudoconsonant placeholder
local TEMPCFLEX = u(0xFFF3) -- placeholder to be converted to a circumflex
local TEMPSUB = u(0xFFF4) -- miscellaneous temporary placeholder
-- accents covered by this group of definitions: acute, grave, diaeresis,
-- breve, caron, ogonek (note: circumflex, double grave and the dots are not
-- included here)
local accent = AC .. GR .. DIA .. BREVE .. CARON .. OGONEK
-- pattern for zero or more of the accents above
local opt_accent = "[" .. accent .. "]*"
-- precomposed Cyrillic vowels carrying a grave accent
local composed_grave_vowel = "ѐЀѝЍ"
-- any Cyrillic vowel except ёЁ (includes the pseudovowel placeholder and the
-- precomposed grave vowels)
local vowel_no_jo = "аеиоуяэыюіѣѵАЕИОУЯЭЫЮІѢѴ" .. PSEUDOVOWEL .. composed_grave_vowel
-- any Cyrillic vowel, including ёЁ
local vowel = vowel_no_jo .. "ёЁ"
-- any vowel in transliteration (includes the pseudovowel placeholder)
local tr_vowel = "aeěɛiouyAEĚƐIOUY" .. PSEUDOVOWEL
-- any consonant in transliteration, omitting soft/hard sign (includes the
-- pseudoconsonant placeholder)
local tr_cons_no_sign = "bcčdfghjklmnpqrsštvwxzžBCČDFGHJKLMNPQRSŠTVWXZŽ" .. PSEUDOCONS
-- any consonant in transliteration, including soft/hard sign
local tr_cons = tr_cons_no_sign .. "ʹʺ"
-- pattern for any consonant in transliteration, including soft/hard sign,
-- optionally followed by any accent
local tr_cons_acc_re = "[" .. tr_cons .. "]" .. opt_accent
-- any Cyrillic consonant except sibilants and ц (soft and hard signs and the
-- pseudoconsonant placeholder are included)
local cons_except_sib_c = "бдфгйклмнпрствхзьъБДФГЙКЛМНПРСТВХЗЬЪ" .. PSEUDOCONS
-- Cyrillic sibilant consonants
local sib = "шщчжШЩЧЖ"
-- Cyrillic sibilant consonants and ц
local sib_c = sib .. "цЦ"
-- any Cyrillic consonant
local cons = cons_except_sib_c .. sib_c
-- Cyrillic velar consonants
local velar = "кгхКГХ"
-- uppercase Cyrillic letters: the vowels first, then the consonants and signs
-- (the original comment said "consonants", but vowels are listed too)
local uppercase = "АЕИОУЯЭЫЁЮІѢѴБДФГЙКЛМНПРСТВХЗЬЪШЩЧЖЦ"
-- Lookup table from "base letter + combining diacritic" (two characters) to
-- the corresponding precomposed character. decompose() passes it as the
-- replacement table when re-joining pairs after NFD normalization. An entry
-- can only take effect if its diacritic is part of the character class used
-- in decompose()'s pattern.
local recomposer = {
-- Cyrillic letters
["е" .. DIA] = "ё",
["Е" .. DIA] = "Ё",
["и" .. BREVE] = "й",
["И" .. BREVE] = "Й",
["і" .. DIA] = "ї",
["І" .. DIA] = "Ї",
-- Latin letters
["c" .. CARON] = "č",
["C" .. CARON] = "Č",
["e" .. CARON] = "ě",
["E" .. CARON] = "Ě",
["o" .. CARON] = "ǒ",
["O" .. CARON] = "Ǒ",
["o" .. OGONEK] = "ǫ",
["O" .. OGONEK] = "Ǫ",
["s" .. CARON] = "š",
["S" .. CARON] = "Š",
["z" .. CARON] = "ž",
["Z" .. CARON] = "Ž",
-- used in ru-pron:
["ж" .. BREVE] = "ӂ", -- used in ru-pron
["Ж" .. BREVE] = "Ӂ",
["j" .. CFLEX] = "ĵ",
["J" .. CFLEX] = "Ĵ",
["j" .. CARON] = "ǰ",
-- no composed uppercase equivalent of J-caron
["ʒ" .. CARON] = "ǯ",
["Ʒ" .. CARON] = "Ǯ",
}
-- Decompose acute, grave, etc. on letters (esp. Latin) into individual
-- character + combining accent. But recompose Cyrillic and Latin characters
-- that we want to treat as units and get caught in the crossfire. We mostly
-- want acute and grave decomposed; perhaps should just explicitly decompose
-- those and no others.
--
-- @param text string to normalize
-- @return the text in NFD form, except that every base+diacritic pair that
--   has an entry in `recomposer` is collapsed back into its precomposed
--   character
local function decompose(text)
	text = strnfd(text)
	-- The class has to contain every combining mark that appears in a key of
	-- `recomposer`. CFLEX was missing, which made the j/J + circumflex
	-- entries (ĵ, Ĵ) unreachable. Pairs without a table entry (e.g. a vowel
	-- followed by a circumflex or an unlisted caron pair) are expected to be
	-- left unchanged, as with standard gsub table replacements; the existing
	-- code already relies on that for the other diacritics.
	text = strsub(text, ".[" .. CFLEX .. BREVE .. DIA .. CARON .. OGONEK .. "]", recomposer)
	return text
end
-- Replacement table used by remove_grave_accents(): the combining grave is
-- mapped to the empty string and each precomposed Cyrillic grave vowel to its
-- plain counterpart.
local grave_deaccenter = {
[GR] = "", -- grave accent
["ѐ"] = "е", -- composed Cyrillic chars w/grave accent
["Ѐ"] = "Е",
["ѝ"] = "и",
["Ѝ"] = "И",
}
-- Strip grave accents from a Cyrillic word and, optionally, from its
-- transliteration. Acute accents, the composed diaeresis in ёЁ and an
-- uncomposed diaeresis (as in -ѣ̈-, cf. plural сѣ̈дла of сѣдло́) are untouched.
-- NOTE: the transliteration must already be decomposed, because only the
-- combining grave character is removed from it.
--
-- @param word Cyrillic text
-- @param tr   transliteration, or nil/false when there is none
-- @return the word without grave accents, followed by the transliteration
--   without grave accents (nil when no transliteration was given)
local function remove_grave_accents(word, tr)
	local stripped_word = strsubn(word, "[̀ѐЀѝЍ]", grave_deaccenter)
	local stripped_tr = nil
	if tr then
		stripped_tr = strsubn(tr, GR, "")
	end
	return stripped_word, stripped_tr
end
-- local test_new_ru_pron_module = false
-- If enabled, do new code for final -е; else, the old way
local new_final_e_code = true
-- If enabled, do special case for final -е not before a pause
local final_e_non_pausal = false
-- Vowel symbols of the internal (transliteration-based) representation
local vow = 'aeiouyɛəäạëöü'
-- The same vowels plus additional IPA vowel symbols
local ipa_vow = vow .. 'ɐɪʊɨæɵʉ'
-- Character class for one internal vowel, plain and as a capture group
local vowels, vowels_c = '[' .. vow .. ']', '([' .. vow .. '])'
-- Accent marks of the internal representation.
-- No need to include DUBGR here because we rewrite it to CFLEX very early
local acc = AC .. GR .. CFLEX .. DOTABOVE .. DOTBELOW
-- Character class for any one of those accent marks
local accents = '[' .. acc .. ']'
-- Character class for acute or grave only
local stress_accents = '[' .. AC .. GR .. ']'
-- Set (cluster -> true) of consonant clusters treated as "permanent" syllable
-- onsets; presumably consulted by the syllabification step, which keeps such
-- clusters together -- TODO confirm against the code that reads it.
local perm_syl_onset = list_to_set({
'spr', 'str', 'skr', 'spl', 'skl',
-- FIXME, do we want sc?
'sp', 'st', 'sk', 'sf', 'sx', 'sc',
'pr', 'br', 'tr', 'dr', 'kr', 'gr', 'fr', 'vr', 'xr',
'pl', 'bl', 'kl', 'gl', 'fl', 'vl', 'xl',
-- FIXME, do we want the following? If so, do we want vn?
'ml', 'mn',
-- FIXME, dž is now converted to ĝž, which will have a syllable
-- boundary in between
'šč', 'dž',
})
-- Maps single-character internal consonant symbols to their IPA spelling
-- (affricates become tie-bar sequences). Where it is applied is outside the
-- visible part of this file.
-- FIXME: Consider changing ӂ internally to ʑ to match ɕ (it is used externally
-- in e.g. дроӂӂи (pronunciation spelling of дрожжи)
local translit_conv = {
['c'] = 't͡s', ['č'] = 't͡ɕ', ['ĉ'] = 't͡ʂ',
['g'] = 'ɡ', ['ĝ'] = 'd͡ʐ',
['ĵ'] = 'd͡z', ['ǰ'] = 'd͡ʑ', ['ӂ'] = 'ʑ',
['š'] = 'ʂ', ['ž'] = 'ʐ'
}
-- Like translit_conv, but keyed on an internal affricate followed by the
-- palatalization mark ʲ; the IPA value keeps the ʲ after the affricate.
local translit_conv_j = {
['cʲ'] = 't͡sʲ',
['ĵʲ'] = 'd͡zʲ'
}
-- Table of allophones, keyed by internal vowel symbol. Each entry is a list
-- of three values:
-- (1) the stressed value; (2) the value immediately before primary or
-- secondary stress; (3) the value elsewhere.
-- The keys ä, ë, ö, ü are the values produced by the `iotating` table.
local allophones = {
['a'] = { 'a', 'ɐ', 'ə' },
['e'] = { 'e', 'ɪ', 'ɪ' },
['i'] = { 'i', 'ɪ', 'ɪ' },
['o'] = { 'o', 'ɐ', 'ə' },
['u'] = { 'u', 'ʊ', 'ʊ' },
['y'] = { 'ɨ', 'ɨ', 'ɨ' },
['ɛ'] = { 'ɛ', 'ɨ', 'ɨ' },
['ä'] = { 'a', 'ɪ', 'ɪ' },
['ạ'] = { 'a', 'ɐ', 'ə' },
['ë'] = { 'e', 'ɪ', 'ɪ' },
['ö'] = { 'ɵ', 'ɪ', 'ɪ' },
['ü'] = { 'u', 'ʊ', 'ʊ' },
['ə'] = { 'ə', 'ə', 'ə' },
}
-- Voiced internal consonant -> voiceless counterpart. ru_ipa_main uses it for
-- word-final devoicing and for devoicing before a voiceless consonant.
local devoicing = {
['b'] = 'p', ['d'] = 't', ['g'] = 'k',
['z'] = 's', ['v'] = 'f',
['ž'] = 'š', ['ɣ'] = 'x',
['ĵ'] = 'c', ['ǰ'] = 'č', ['ĝ'] = 'ĉ',
['ӂ'] = 'ɕ',
}
-- Voiceless internal consonant -> voiced counterpart (the inverse of
-- `devoicing`). ru_ipa_main uses it for voicing before a voiced obstruent.
local voicing = {
['p'] = 'b', ['t'] = 'd', ['k'] = 'g',
['s'] = 'z', ['f'] = 'v',
['š'] = 'ž', ['c'] = 'ĵ', ['č'] = 'ǰ', ['ĉ'] = 'ĝ',
['x'] = 'ɣ', ['ɕ'] = 'ӂ'
}
-- Plain vowel -> iotated (palatal) internal vowel symbol. ru_ipa_main applies
-- it to a vowel that follows j.
local iotating = {
['a'] = 'ä',
['e'] = 'ë',
['o'] = 'ö',
['u'] = 'ü'
}
-- Front vowel -> retracted vowel. ru_ipa_main applies it to е and и after the
-- hard sibilants/affricates (ц, ш, ж and related internal symbols).
local retracting = {
['e'] = 'ɛ',
['i'] = 'y',
}
-- Vowel -> fronted IPA vowel. Not referenced in the visible part of this
-- file; presumably applied to vowels in a palatal environment -- TODO confirm.
local fronting = {
['a'] = 'æ',
['u'] = 'ʉ',
['ʊ'] = 'ʉ',
}
-- Maps each Cyrillic letter (uppercase and lowercase) to a one-element list
-- holding the letter's name spelled in Cyrillic. Presumably used to generate
-- the pronunciation of a single-letter entry from its name -- TODO confirm
-- against the code that reads it.
local pron_abc = {
["А"] = {"а"},
["а"] = {"а"},
["Б"] = {"бэ"},
["б"] = {"бэ"},
["В"] = {"вэ"},
["в"] = {"вэ"},
["Г"] = {"гэ"},
["г"] = {"гэ"},
["Д"] = {"дэ"},
["д"] = {"дэ"},
["Е"] = {"е"},
["е"] = {"е"},
["Ё"] = {"ё"},
["ё"] = {"ё"},
["Ж"] = {"жэ"},
["ж"] = {"жэ"},
["З"] = {"зэ"},
["з"] = {"зэ"},
["И"] = {"и"},
["и"] = {"и"},
["Й"] = {"и краткое"},
["й"] = {"и краткое"},
["К"] = {"ка"},
["к"] = {"ка"},
["Л"] = {"эл"},
["л"] = {"эл"},
["М"] = {"эм"},
["м"] = {"эм"},
["Н"] = {"эн"},
["н"] = {"эн"},
["О"] = {"о"},
["о"] = {"о"},
["П"] = {"пэ"},
["п"] = {"пэ"},
["Р"] = {"эр"},
["р"] = {"эр"},
["С"] = {"эс"},
["с"] = {"эс"},
["Т"] = {"тэ"},
["т"] = {"тэ"},
["У"] = {"у"},
["у"] = {"у"},
["Ф"] = {"эф"},
["ф"] = {"эф"},
["Х"] = {"ха"},
["х"] = {"ха"},
["Ц"] = {"це"},
["ц"] = {"це"},
["Ч"] = {"че"},
["ч"] = {"че"},
["Ш"] = {"ша"},
["ш"] = {"ша"},
["Щ"] = {"ща"},
["щ"] = {"ща"},
["Ъ"] = {"твёрдый знак"},
["ъ"] = {"твёрдый знак"},
["Ы"] = {"ы"},
["ы"] = {"ы"},
["Ь"] = {"мягкий знак"},
["ь"] = {"мягкий знак"},
["Э"] = {"э"},
["э"] = {"э"},
["Ю"] = {"ю"},
["ю"] = {"ю"},
["Я"] = {"я"},
["я"] = {"я"},
-- old (pre-reform and Church Slavonic) letters
["І"] = {"і десятеричное"},
["і"] = {"і десятеричное"},
["Ѣ"] = {"ять"},
["ѣ"] = {"ять"},
["Ѳ"] = {"ѳита"},
["ѳ"] = {"ѳита"},
["Ѵ"] = {"ижица"},
["ѵ"] = {"ижица"},
["Ѕ"] = {"ѕѣлѡ"},
["ѕ"] = {"ѕѣлѡ"},
["Ѯ"] = {"ѯи"},
["ѯ"] = {"ѯи"},
["Ѱ"] = {"ѱи"},
["ѱ"] = {"ѱи"},
["Ѡ"] = {"ѡмега"},
["ѡ"] = {"ѡмега"},
["Ѫ"] = {"юсъ большой"},
["ѫ"] = {"юсъ большой"},
["Ѧ"] = {"юсъ малый"},
["ѧ"] = {"юсъ малый"},
["Ѭ"] = {"юсъ большой іотированный"},
["ѭ"] = {"юсъ большой іотированный"},
["Ѩ"] = {"юсъ малый іотированный"},
["ѩ"] = {"юсъ малый іотированный"}
}
-- Prefixes that we recognize specially when they end in a geminated
-- consonant. The first element is the result after applying voicing/devoicing,
-- gemination and other changes. The second element is the original spelling,
-- so that we don't overmatch and get cases like Поттер. We check for these
-- prefixes at the beginning of words and also preceded by ne-, po- and nepo-.
-- The third element should be true if the prefix produces [žž] when assimilated
-- to a following ж, otherwise omitted. We use this as part of the
-- implementation of automatic ӂӂ pronunciation, which shouldn't happen at
-- prefix boundaries.
-- Both the first and second elements are pattern fragments (they may contain
-- character classes and `?`); the caller prepends the word-boundary marker ⁀
-- and, where applicable, ne/po/nepo before matching.
local geminate_pref = {
--'abː', --'adː',
{'be[szšž]ː', 'be[sz]', true},
--'braomː',
{'[vf]ː', 'v'},
{'vo[szšž]ː', 'vo[sz]', true},
{'i[szšž]ː', 'i[sz]', true},
--'^inː',
{'kontrː', 'kontr'},
{'superː', 'super'},
{'tran[szšž]ː', 'trans', true},
{'na[tdcč]ː', 'nad'},
{'ni[szšž]ː', 'ni[sz]', true},
{'o[tdcč]ː', 'ot'}, --'^omː',
{'o[bp]ː', 'ob'},
{'obe[szšž]ː', 'obe[sz]', true},
{'po[tdcč]ː', 'pod'},
{'pre[tdcč]ː', 'pred'}, --'^paszː', '^pozː',
{'ra[szšž]ː', 'ra[sz]', true},
{'[szšž]ː', '[sz]', true},
{'me[žš]ː', 'mež', true},
{'če?re[szšž]ː', 'če?re[sz]', true},
-- certain double prefixes involving ra[zs]-
{'predra[szšž]ː', 'predra[sz]', true},
{'bezra[szšž]ː', 'bezra[sz]', true},
{'nara[szšž]ː', 'nara[sz]', true},
{'vra[szšž]ː', 'vra[sz]', true},
{'dora[szšž]ː', 'dora[sz]', true},
-- '^sverxː', '^subː', '^tröxː', '^četyröxː',
}
-- "Long" affricate + sibilant outcome for a prefix-final [dt] followed by
-- s or z, keyed by that sibilant.
local sztab = {
	s = 'cs',
	z = 'ĵz',
}
-- Substitution callback for the от[сз]-/под[сз]- rules in phonetic_subs:
-- `pre` is the captured start of the prefix (without its final consonant) and
-- `sz` the captured sibilant; returns the prefix followed by the long variant.
local function ot_pod_sz(pre, sz)
	local tail = sztab[sz]
	return pre .. tail
end
-- Ad-hoc phonetic substitutions to apply. Each entry is a two-element list,
-- the two arguments to 'strsub()': a pattern and a replacement, where the
-- replacement is either a string (with %1, %2 back-references) or a function
-- receiving the captures. These are applied in order, and are
-- carefully ordered to work correctly; don't reorder them unless you know
-- what you're doing. This is called fairly early on, after transliterating,
-- splitting on words, adding ⁀ at the beginning and end of all words, and
-- applying a few other changes. It mostly implements various sorts of
-- assimilations.
local phonetic_subs = {
{'h', 'ɣ'},
{'šč', 'ɕː'}, -- conversion of šč to geminate
-- the following group is ordered before changes that affect ts
{'n[dt]sk', 'n(t)sk'},
{'s[dt]sk', 'sck'},
-- -дцат- (in numerals) has optionally-geminated дц; if unstressed,
-- pronounced as -дцыт-
{'dca(' .. accents .. '?)t', function(accent)
if accent == '' then
return 'c(c)yt'
else
return 'c(c)a' .. accent .. 't'
end
end
},
-- Add / before цз, чж sequences (Chinese words) and assimilate чж
{'cz', '/cz'},
{'čž', '/ĝž'},
-- main changes for affricate assimilation of [dt] + sibilant, including ts;
-- we either convert to "short" variants t͡s, d͡z, etc. or to "long" variants
-- t͡ss, d͡zz, etc.
-- 1. т с, д з across word boundary, also т/с, д/з with explicitly written
-- slash, use long variants.
{'[dt](ʹ?[ ‿⁀/]+)s', 'c%1s'},
{'[dt](ʹ?[ ‿⁀/]+)z', 'ĵ%1z'},
-- 2. тс, дз + vowel use long variants.
{'[dt](ʹ?)s(j?' .. vowels .. ')', 'c%1s%2'},
{'[dt](ʹ?)z(j?' .. vowels .. ')', 'ĵ%1z%2'},
-- 3. тьс, дьз use long variants.
{'[dt]ʹs', 'cʹs'},
{'[dt]ʹz', 'ĵʹz'},
-- 4. word-initial от[сз]-, под[сз]- use long variants because there is
-- a morpheme boundary.
{'(⁀o' .. accents .. '?)t([sz])', ot_pod_sz},
{'(⁀po' .. accents .. '?)d([sz])', ot_pod_sz},
-- 5. other тс, дз use short variants.
{'[dt]s', 'c'},
{'[dt]z', 'ĵ'},
-- 6. тш, дж always use long variants (FIXME, may change)
{'[dtč](ʹ?[ %-‿⁀/]*)š', 'ĉ%1š'},
{'[dtč](ʹ?[ %-‿⁀/]*)ž', 'ĝ%1ž'},
-- 7. soften palatalized hard hushing affricates resulting from the previous
{'ĉʹ', 'č'},
{'ĝʹ', 'ǰ'},
-- changes that generate ɕː and ɕč through assimilation:
-- зч and жч become ɕː, as does сч at the beginning of a word and in the
-- sequence счёт when not following [цдт] (подсчёт); else сч becomes ɕč
-- (отсчи́тываться), as щч always does (рассчитáть written ращчита́ть)
{'[cdt]sč', 'čɕː'},
{'ɕːč', 'ɕč'},
{'[zž]č', 'ɕː'},
{'[szšž]ɕː?', 'ɕː'},
{'⁀sč', '⁀ɕː'},
{'sč(j?[oi]' .. accents .. '?)t', 'ɕː%1t'},
{'sč', 'ɕč'},
-- misc. changes for assimilation of [dtsz] + sibilants and affricates
{'[sz][dt]c', 'sc'},
{'([rn])[dt]([cč])', '%1%2'},
-- дц, тц, дч, тч + vowel always remain geminated, so mark this with ˑ;
-- if not followed by a vowel, as in e.g. путч, use normal gemination
-- (it will normally be degeminated)
{'[dt]([cč])(' .. vowels .. ')', '%1ˑ%2'},
{'[dt]([cč])', '%1%1'},
-- the following is ordered before the next one, which applies assimilation
-- of [тд] to щ (including across word boundaries)
{'n[dt]ɕ', 'nɕ'},
-- [сз] and [сз]ь before soft affricates [щч], including across word
-- boundaries; note that the common sequence сч has already been handled
{'[zs]ʹ?([ ‿⁀/]*[ɕč])', 'ɕ%1'},
-- reduction of too many ɕ's, which can happen from the previous
{'ɕɕː', 'ɕː'},
-- assimilation before [тдц] and [тдц]ь before щ
{'[cdt]ʹ?([ ‿⁀/]*)ɕ', 'č%1ɕ'},
-- assimilation of [сз] and [сз]ь before [шж]
{'[zs]([ ‿⁀/]*)š', 'š%1š'},
{'[zs]([ ‿⁀/]*)ž', 'ž%1ž'},
{'[zs]ʹ([ ‿⁀/]*)š', 'ɕ%1š'},
{'[zs]ʹ([ ‿⁀/]*)ž', 'ӂ%1ž'},
-- assimilation of [сз]ь before с[еияёю] (in imperatives esp. before ся)
{'[zs]ʹs([eij])', 'sˑ%1'},
-- assimilation of [тд]ь before т[еияёю] (e.g. in imperatives esp. before те)
{'[td]ʹt([eij])', 'tˑ%1'},
-- optional palatalization of palatalized labials before another consonant
-- in [ст][еияёю] (esp. in imperatives before -те, -ся)
-- FIXME, perhaps we should either generalize this or restrict it only
-- to imperatives
{'([mpbfv])ʹ([st][eij])', '%1(ʹ)%2'},
{'sverxi', 'sverxy'},
{'stʹd', 'zd'},
-- this will often become degeminated
{'tʹd', 'dd'},
-- loss of consonants in certain clusters
{'([ns])[dt]g', '%1g'},
{'zdn', 'zn'},
{'lnc', 'nc'},
{'[sz]t(li' .. accents .. '?v)', 's%1'},
{'[sz]tn', 'sn'},
{'lvstv', 'lstv'},
-- initial unstressed э -> и; should precede backing of /i/ in close juncture
{'⁀ɛ([^' .. acc .. '])', '⁀i%1'},
-- unstressed э after a vowel -> и; repeated to handle the unlikely case
-- where two ээ occur in a row; FIXME, this is a type of ikanye, and we
-- mostly implement ikanye later on using the chart in 'allophones', so
-- it would be nice to merge these two cases, but I can't think of an
-- obvious way to do it
{'(' .. vowels .. accents .. '?)ɛ([^' .. acc .. '])', '%1i%2'},
{'(' .. vowels .. accents .. '?)ɛ([^' .. acc .. '])', '%1i%2'},
-- backing of /i/ after hard consonants in close juncture
{'([mnpbtdkgfvszxɣrlšžcĵĉĝ])⁀‿⁀i', '%1⁀‿⁀y'},
}
-- Two sets of three-symbol clusters (consonant + consonant + ʲ), grouped
-- under the keys `compulsory` and `optional`. Presumably they list the
-- clusters in which the first consonant is always / optionally palatalized by
-- assimilation to the second -- TODO confirm against the code that reads it.
local cons_assim_palatal = {
-- assimilation of tn, dn, sn, zn, st, zd, nč, nɕ is handled specially
compulsory = list_to_set({'ntʲ', 'ndʲ', 'xkʲ',
'csʲ', 'ĵzʲ', 'ncʲ', 'nĵʲ'}),
optional = list_to_set({'slʲ', 'zlʲ', 'nsʲ', 'nzʲ',
'mpʲ', 'mbʲ', 'mfʲ', 'fmʲ'})
}
-- words which will be treated as accentless (i.e. their vowels will be
-- reduced), and which will liaise with a preceding or following word;
-- this will not happen if the words have an accent mark, cf.
-- по́ небу vs. по не́бу, etc.
-- The table holds four sets of transliterated words, one per class below.
local accentless = {
-- class 'pre': particles that join with a following word
pre = list_to_set({'bez', 'bliz', 'v', 'vo', 'da', 'do',
'za', 'iz', 'iz-pod', 'iz-za', 'izo', 'k', 'ko', 'mež',
'na', 'nad', 'nado', 'ne', 'ni', 'ob', 'obo', 'ot', 'oto',
'pered', 'peredo', 'po', 'pod', 'podo', 'pred', 'predo', 'pri', 'pro',
's', 'so', 'u', 'čerez'}),
-- class 'prespace': particles that join with a following word, but only
-- if a space (not a hyphen) separates them; hyphens are used here
-- to spell out letters, e.g. а-эн-бэ́ for АНБ (NSA = National Security
-- Agency) or о-а-э́ for ОАЭ (UAE = United Arab Emirates)
prespace = list_to_set({'a', 'o'}),
-- class 'post': particles that join with a preceding word
post = list_to_set({'by', 'b', 'ž', 'že', 'li', 'libo', 'lʹ', 'ka',
'nibudʹ', 'tka'}),
-- class 'posthyphen': particles that join with a preceding word, but only
-- if a hyphen (not a space) separates them
posthyphen = list_to_set({'to'}),
}
-- Pronunciation of final unstressed -е, depending on the part of speech and
-- exact ending. Also used for pronunciation of -ться in imperatives vs.
-- infinitives.
--
-- Endings:
-- oe = -ое
-- ve = any other vowel plus -е (FIXME, may have to split out -ее)
-- je = -ье
-- softpaired = soft paired consonant + -е
-- hardsib = hard sibilant (ц, ш, ж) + -е
-- softsib = soft sibilant (ч, щ) + -е
--
-- Additional property:
-- tsjapal = whether final -ться is palatalized: 'n' (the default) makes
-- ru_ipa_main rewrite it as a hard affricate, 'y' (imperatives) leaves
-- it alone
--
-- Parts of speech:
-- def = default used in absence of pos
-- n/noun = neuter noun in the nominative/accusative singular (but not ending
-- in adjectival -ое or -ее; those should be considered as adjectives);
-- s and sust are further aliases of noun
-- pre = prepositional case singular (prep is an alias)
-- dat = dative case singular (treated same as prepositional case singular)
-- voc = vocative case (currently treated as 'mid')
-- nnp = noun nominative plural in -е (гра́ждане, боя́ре, армя́не); not
-- adjectival plurals in -ие or -ые, including adjectival nouns
-- (да́нные, а́вторские); snp is an alias
-- inv = invariable noun or other word (currently treated as 'mid')
-- a/adj = adjective or adjectival noun (typically either neuter in -ое or
-- -ее, or plural in -ие, -ые, or -ье, or short neuter in unpaired
-- sibilant + -е)
-- c/com = comparative (typically either in -ее or sibilant + -е)
-- adv = adverb
-- p = preposition (treated same as adverb)
-- v/vb/verb = finite verbal form (usually 2nd-plural in -те), but not
-- imperatives (use pos=imp) and not participle forms, which should be
-- treated as adjectives
-- imp = imperative (impv is an alias)
-- pro = pronoun (кое-, какие-, ваше, сколькие)
-- num = number (двое, трое, обе, четыре; currently treated as 'mid')
-- pref = prefix (treated as 'high' because integral part of word)
-- hi/high/alto = force high values ([ɪ] or [ɨ])
-- mid/med/medio = force mid values ([e] or [ɨ])
-- lo/low/bajo/schwa = force low, really schwa, values ([ə])
--
-- Possible values:
-- 1. ə [ə], e [e], i [ɪ] after a vowel or soft consonant
-- 2. ə [ə] or y [ɨ] after a hard sibilant
--
-- If a part of speech doesn't have an entry for a given type of ending,
-- it receives the default value. If a part of speech's entry is a string,
-- it's an alias for another way of specifying the same part of speech
-- (e.g. n=noun).
local pos_properties = {
def={oe='ə', ve='e', je='e', softpaired='e', hardsib='y', softsib='e', tsjapal='n'},
noun={oe='ə', ve='e', je='e', softpaired='e', hardsib='ə', softsib='e'},
n='noun',
s='noun',
sust='noun',
pre={oe='e', ve='e', softpaired='e', hardsib='y', softsib='e'},
prep='pre',
dat='pre',
voc='mid',
nnp={softpaired='e'}, -- FIXME, not sure about this
snp='nnp',
inv='mid', --FIXME, not sure about this (e.g. вице-, кофе)
adj={oe='ə', ve='e', je='ə'}, -- FIXME: Not sure about -ее, e.g. neut adj си́нее; FIXME, not sure about short neuter adj, e.g. похо́же from похо́жий, дорогосто́яще from дорогосто́ящий, should this be treated as neuter noun?
a='adj',
com={ve='e', hardsib='y', softsib='e'},
c='com',
adv={softpaired='e', hardsib='y', softsib='e'},
p='adv', --FIXME, not sure about prepositions
verb={softpaired='e'},
v='verb',
vb='verb',
-- Imperatives like other verbs except that final -ться is palatalized
imp={softpaired='e', tsjapal='y'},
impv='imp',
pro={oe='i', ve='i'}, --FIXME, not sure about ваше, сколькие, какие-, кое-
num='mid', --FIXME, not sure about обе
pref='high',
-- forced values
high={oe='i', ve='i', je='i', softpaired='i', hardsib='y', softsib='i'},
alto='high',
hi='high',
mid={oe='e', ve='e', je='e', softpaired='e', hardsib='y', softsib='e'},
med = 'mid',
medio = 'mid',
low={oe='ə', ve='ə', je='ə', softpaired='ə', hardsib='ə', softsib='ə'},
bajo = 'low',
lo='low',
schwa='low'
}
-- Remove the marks that should not appear in the phonetic respelling:
-- circumflex, double grave, dot above, dot below and the tie character ‿.
-- (Porter's note, translated: "I don't know what this was for; it used to be
-- in the old main function.")
--
-- @param text         respelling text
-- @param remove_grave when truthy, grave accents are stripped as well
-- @return the cleaned text (a single value)
local function phon_respelling(text, remove_grave)
	local unwanted = '[' .. CFLEX .. DUBGR .. DOTABOVE .. DOTBELOW .. '‿]'
	local cleaned = strsub(text, unwanted, '')
	if not remove_grave then
		return cleaned
	end
	-- Remove grave accents from annotations but maybe not from phonetic
	-- respelling; only the first result of remove_grave_accents() is kept.
	cleaned = remove_grave_accents(cleaned)
	return cleaned
end
-- Convert normalized spelling into actual pronunciation. Return value is a
-- list of one or more valid pronunciations. "Normalized" means that various
-- normalization transformations have been applied, e.g.
-- (1) text is transliterated and accents decomposed;
-- (2) ‿ is added where appropriate to join clitics to normally-stressed words;
-- (3) ⁀ is added at the beginning and end of all words;
-- (4) primary or tertiary stress may have been added to single-syllable words
-- as appropriate;
-- (5) punctuation is removed and replaced with spaces and/or IPA foot
-- boundaries;
-- (6) etc.
-- Note that normalization does *not* implement assimilations, conversion of
-- vowels or consonants to their IPA equivalents, or other intra-word changes.
local function ru_ipa_main(text, adj, gem, pos)
-- save original word spelling before respellings, (de)voicing changes,
-- geminate changes, etc. for implementation of geminate_pref
local orig_word = strsplit(text, " ", true)
local word
-- remove any apostrophes, since any still present at this stage
-- are purely cosmetic (e.g. in foreign names)
-- any apostrophes in the input that are standing in for hard signs
-- should have already been dealt with by the transliteration
-- module
text = strsub(text, '[\'’]', '')
-- insert or remove /j/ before [aou] so that palatal versions of these
-- vowels are always preceded by /j/ and non-palatal versions never are
-- (do this before the change below adding tertiary stress to final
-- palatal о):
-- (1) Non-palatal [ou] after always-hard шж (e.g. in брошю́ра, жю́ри)
-- despite the spelling (FIXME, should this also affect [a]?)
text = strsub(text, '([šž])j([ou])', '%2%3')
-- (2) Palatal [aou] after always-soft щчӂ and voiced variant ǰ (NOTE:
-- this happens before the change šč -> ɕː in phonetic_subs)
text = strsub(text, '([čǰӂ])([aou])', '%1j%2')
-- (3) ьо is pronounced as ьйо, i.e. like (possibly unstressed) ьё, e.g.
-- in Асунсьо́н
text = strsub(text, 'ʹo', 'ʹjo')
-- add tertiary stress to some final -о (this needs to be done before
-- eliminating dot-above, after adding ⁀, after adding /j/ before palatal о):
-- (1) after vowels, e.g. То́кио
text = strsub(text, '(' .. vowels .. accents .. '?o)⁀', '%1' .. CFLEX .. '⁀')
-- (2) when palatal, e.g. ра́нчо, га́учо, ма́чо, Ога́йо
text = strsub(text, 'jo⁀', 'jo' .. CFLEX .. '⁀')
-- eliminate dot-above, which has served its purpose of preventing any
-- sort of stress (needs to be done after adding tertiary stress to
-- final -о)
text = strsub(text, DOTABOVE, '')
-- eliminate dot-below (needs to be done after changes above that insert
-- j before [aou] after always-soft щчӂ)
text = strsub(text, 'ja' .. DOTBELOW, 'jạ')
if strfind(text, DOTBELOW) then
error("Dot-below accent can only be placed on я or palatal а")
end
text = adj and strsub(text, '(.[aoe]́?)go(' .. AC .. '?)⁀', '%1vo%2⁀') or text
text = adj and strsub(text, '(.[aoe]́?)go(' .. AC .. '?)sja⁀', '%1vo%2sja⁀') or text
local function fetch_pos_property(i, ending)
local thispos = pos[i] or 'def'
local chart = pos_properties[thispos]
while type(chart) == "string" do -- handle aliases
chart = pos_properties[chart]
end
assert(type(chart) == "table")
local sb = chart[ending] or pos_properties['def'][ending]
assert(sb)
return sb
end
-- Pos-specific handling of final -ться: palatalized if pos=imp, else not
-- (infinitives). If we have multiple parts of speech, we need to be
-- trickier, splitting by word.
local function final_tsja_processing(pron, i)
local tsjapal = fetch_pos_property(i, 'tsjapal')
if tsjapal == 'n' then
-- FIXME!!! Should these also pay attention to grave accents?
pron = strsub(pron, '́tʹ?sja⁀', '́cca⁀')
pron = strsub(pron, '([^́])tʹ?sja⁀', '%1ca⁀')
end
return pron
end
--split by word and process each word
word = strsplit(text, " ", true)
for i = 1, #word do
word[i] = final_tsja_processing(word[i], i)
end
text = concat(word, " ")
--phonetic substitutions of various sorts
for _, phonsub in ipairs(phonetic_subs) do
text = strsub(text, phonsub[1], phonsub[2])
end
--voicing, devoicing
--NOTE: v before an obstruent assimilates in voicing and triggers voicing
--assimilation of a preceding consonant; neither happens before a sonorant
--1. absolutely final devoicing
text = strsub(text, '([bdgvɣzžĝĵǰӂ])(ʹ?⁀)$', function(a, b)
return devoicing[a] .. b end)
--2. word-final devoicing before another word
text = strsub(text, '([bdgvɣzžĝĵǰӂ])(ʹ?⁀ ⁀[^bdgɣzžĝĵǰӂ])', function(a, b)
return devoicing[a] .. b end)
--3. voicing/devoicing assimilation; repeat to handle recursive assimilation
while true do
local new_text = strsub(text, '([bdgvɣzžĝĵǰӂ])([ ‿⁀ʹːˑ()/]*[ptkfxsščɕcĉ])', function(a, b)
return devoicing[a] .. b end)
new_text = strsub(new_text, '([ptkfxsščɕcĉ])([ ‿⁀ʹːˑ()/]*v?[ ‿⁀ʹːˑ()/]*[bdgɣzžĝĵǰӂ])', function(a, b)
return voicing[a] .. b end)
if new_text == text then
break
end
text = new_text
end
--re-notate orthographic geminate consonants
text = strsub(text, '([^' .. vow .. '.%-_])' .. '%1', '%1ː')
text = strsub(text, '([^' .. vow .. '.%-_])' .. '%(%1%)', '%1(ː)')
--rewrite iotated vowels
text = strsub(text, '(j[%(ːˑ%)]*)([aeou])', function(a, b)
return a .. iotating[b] end)
-- eliminate j after consonant and before iotated vowel (including
-- semi-reduced ạ)
text = strsub(text, '([^' .. vow .. acc .. 'ʹʺ‿⁀ ]/?)j([äạëöü])', '%1%2')
--split by word and process each word
word = strsplit(text, " ", true)
for i = 1, #word do
local pron = word[i]
-- Check for gemination at prefix boundaries; if so, convert the
-- regular gemination symbol ː to a special symbol ˑ that indicates
-- we always preserve the gemination unless gem=n. We look for
-- certain sequences at the beginning of a word, but make sure that
-- the original spelling is appropriate as well (see comment above
-- for geminate_pref).
if strfind(pron, 'ː') then -- optimize by only doing when gemination present
local orig_pron = orig_word[i]
local deac = strsub(pron, accents, '')
local orig_deac = strsub(orig_pron, accents, '')
-- the following two are optimizations to reduce the number of regex
-- checks in the majority of cases with words not beginning with ne-
-- or po-.
local is_ne = strfind(orig_deac, '⁀ne')
local is_po = strfind(orig_deac, '⁀po')
for _, gempref in ipairs(geminate_pref) do
local newspell = gempref[1]
local oldspell = gempref[2]
-- FIXME! The rsub below will be incorrect if there is
-- gemination in a joined preposition or particle
if strfind(orig_deac, '⁀' .. oldspell) and strfind(deac, '⁀' .. newspell) or
is_po and strfind(orig_deac, '⁀po' .. oldspell) and strfind(deac, '⁀po' .. newspell) or
is_ne and strfind(orig_deac, '⁀ne' .. oldspell) and strfind(deac, '⁀ne' .. newspell) or
is_ne and strfind(orig_deac, '⁀nepo' .. oldspell) and strfind(deac, '⁀nepo' .. newspell) then
pron = strsub(pron, '(⁀[^‿⁀ː]*)ː', '%1ˑ')
end
end
end
--degemination, optional gemination
local thisgem = gem[i] or 'o'
if thisgem == 'y' then
-- leave geminates alone, convert ˑ to regular gemination; ˑ is a
-- special gemination symbol used at prefix boundaries that we
-- remove only when gem=n, else we convert it to regular gemination
pron = strsub(pron, 'ˑ', 'ː')
elseif thisgem == 'o' then
-- make geminates optional, except for ɕӂ, also ignore left paren
-- in (ː) sequence
pron = strsub(pron, '([^ɕӂ%(%)])[ːˑ]', '%1(ː)')
elseif thisgem == 'n' then
-- remove gemination, except for ɕӂ
pron = strsub(pron, '([^ɕӂ%(%)])[ːˑ]', '%1')
else
-- degeminate l's
pron = strsub(pron, '(l)ː', '%1')
-- preserve gemination between vowels immediately after the stress,
-- special gemination symbol ˑ also remains, ɕӂ remain geminated,
-- žn remain geminated between vowels even not immediately after
-- the stress, n becomes optionally geminated when after but not
-- immediately after the stress, ssk and zsk remain geminated
-- immediately after the stress, else degeminate; we signal that
-- gemination should remain by converting to special symbol ˑ,
-- then removing remaining ː not after ɕӂ and left paren; do
-- various subs repeatedly in case of multiple geminations in a word
-- 1. immediately after the stress
pron = strsubrep(pron, '(' .. vowels .. stress_accents .. '[^ɕӂ%(%)])ː(' .. vowels .. ')', '%1ˑ%2')
-- 2. remaining geminate n after the stress between vowels
pron = strsubrep(pron, '(' .. stress_accents .. '.-' .. vowels .. accents .. '?n)ː(' .. vowels .. ')', '%1(ː)%2')
-- 3. remaining ž and n between vowels
pron = strsubrep(pron, '(' .. vowels .. accents .. '?[žn])ː(' .. vowels .. ')', '%1ˑ%2')
-- 4. ž word initially before vowels (жжение, жжём, etc.)
pron = strsubrep(pron, '(⁀ž)ː(' .. vowels .. ')', '%1ˑ%2')
-- 5. ssk (and zsk, already normalized) immediately after the stress
pron = strsub(pron, '(' .. vowels .. stress_accents .. '[^' .. vow .. ']*s)ː(k)', '%1ˑ%2')
-- 6. eliminate remaining gemination, except for ɕː and ӂː
pron = strsub(pron, '([^ɕӂ%(%)])ː', '%1')
-- 7. convert special gemination symbol ˑ to regular gemination
pron = strsub(pron, 'ˑ', 'ː')
end
-- handle soft and hard signs, assimilative palatalization
-- 1. insert j before i when required
pron = strsub(pron, 'ʹi', 'ʹji')
-- 2. insert glottal stop after hard sign if required
pron = strsub(pron, 'ʺ([aɛiouy])', 'ʔ%1')
-- 3. (ь) indicating optional palatalization
pron = strsub(pron, '%(ʹ%)', '⁽ʲ⁾')
-- 4. assimilative palatalization of consonants when followed by
-- front vowels or soft sign
pron = strsub(pron, '([mnpbtdkgfvszxɣrl' .. PSEUDOCONS ..'])([ː()]*[eiäạëöüʹ])', '%1ʲ%2')
pron = strsub(pron, '([cĵ])([ː()]*[äạöüʹ])', '%1ʲ%2')
-- 5. remove hard and soft signs
pron = strsub(pron, "[ʹʺ]", "")
-- reduction of unstressed word-final -я, -е; but special-case
-- unstressed не, же. Final -я always becomes [ə]; final -е may
-- become [ə], [e], [ɪ] or [ɨ] depending on the part of speech and
-- the preceding consonants/vowels.
pron = strsub(pron, '[äạ]⁀', 'ə⁀')
pron = strsub(pron, '⁀nʲe⁀', '⁀nʲi⁀')
pron = strsub(pron, '⁀že⁀', '⁀žy⁀')
-- function to fetch the appropriate value for ending and part of
-- speech, handling aliases and defaults and converting 'e' to 'ê'
-- so that the unstressed [e] sound is preserved
local function fetch_e_sub(ending)
local sub = fetch_pos_property(i, ending)
if sub == 'e' then
-- add TEMPCFLEX (which will be converted to CFLEX) to preserve
-- the unstressed [e] sound, which will otherwise be converted
-- to [ɪ]; we do this instead of adding CFLEX directly because
-- we later convert some instances of the resulting 'e' to
-- 'i', and we don't want to do this when the user explicitly
-- wrote a Cyrillic е with a circumflex on it. [NOTE that
-- formerly applied when we added CFLEX directly: DO NOT
-- use ê here directly because it's a single composed char,
-- when we need the e and accent to be separate.]
return 'e' .. TEMPCFLEX
else
return sub
end
end
if new_final_e_code then
-- as requested by Atitarev, final unstressed -ɛ should be unreduced
pron = strsub(pron, 'ɛ⁀', 'ɛ' .. TEMPCFLEX .. '⁀')
-- handle substitutions in two parts, one for vowel+j+e sequences
-- and the other for cons+e sequences
pron = strsub(pron, vowels_c .. '(' .. accents .. '?j)ë⁀', function(v, ac)
local ty = v == 'o' and 'oe' or 've'
return v .. ac .. fetch_e_sub(ty) .. '⁀'
end)
-- consonant may palatalized, geminated or optional-geminated
pron = strsub(pron, '(.)(ʲ?[ː()]*)[eë]⁀', function(ch, mod)
local ty = ch == 'j' and 'je' or
strfind(ch, '[cĵšžĉĝ]') and 'hardsib' or
strfind(ch, '[čǰɕӂ]') and 'softsib' or
'softpaired'
return ch ..mod .. fetch_e_sub(ty) .. '⁀'
end)
if final_e_non_pausal then
-- final [e] should become [ɪ] when not followed by pause or
-- end of utterance (in other words, followed by space plus
-- anything but a pause symbol, or followed by tie bar).
pron = strsub(pron, 'e' .. TEMPCFLEX .. '⁀‿', 'i⁀‿')
if i < #word and word[i+1] ~= '⁀|⁀' then
pron = strsub(pron, 'e' .. TEMPCFLEX .. '⁀$', 'i⁀')
end
end
-- now convert TEMPCFLEX to CFLEX; we use TEMPCFLEX so the previous
-- two regexps won't affect cases where the user explicitly wrote
-- a circumflex
pron = strsub(pron, TEMPCFLEX, CFLEX)
else
-- Do the old way, which mostly converts final -е to schwa, but
-- has highly broken retraction code for vowel + [шжц] + е (but
-- not with accent on vowel!) before it that causes final -е in
-- this circumstance to become [ɨ], and a special hack for кое-.
pron = strsub(pron, vowels_c .. '([cĵšžĉĝ][ː()]*)[eë]', '%1%2ɛ')
pron = strsub(pron, '⁀ko(' .. stress_accents .. ')jë⁀', '⁀ko%1ji⁀')
pron = strsub(pron, '[eë]⁀', 'ə⁀')
end
-- retraction of е and и after цшж
pron = strsub(pron, '([cĵšžĉĝ][ː()]*)([ei])', function(a, b)
return a .. retracting[b] end)
--syllabify, inserting @ at syllable boundaries
--1. insert @ after each vowel
pron = strsub(pron, '(' .. vowels .. accents .. '?)', '%1@')
--2. eliminate word-final @
pron = strsub(pron, '@+⁀$', '⁀')
--3. move @ forward directly before any ‿⁀, as long as at least
-- one consonant follows that; we will move it across ‿⁀ later
pron = strsub(pron, '@([^@' .. vow .. acc .. ']*)([‿⁀]+[^‿⁀@' .. vow .. acc .. '])', '%1@%2')
--4. in a consonant cluster, move @ forward so it's before the
-- last consonant
pron = strsub(pron, '@([^‿⁀@' .. vow .. acc .. ']*)([^‿⁀@' .. vow .. acc .. 'ːˑ()ʲ]ʲ?[ːˑ()]*‿?[' .. vow .. acc .. '])', '%1@%2')
--5. move @ backward if in the middle of a "permanent onset" cluster,
-- e.g. sk, str, that comes before a vowel, putting the @ before
-- the permanent onset cluster
pron = strsub(pron, '([^‿⁀@_' .. vow .. acc .. ']?)(_*)([^‿⁀@_' .. vow .. acc .. '])(_*)@([^‿⁀@' .. vow .. acc .. 'ːˑ()ʲ])(ʲ?[ːˑ()]*[‿⁀]*[' .. vow .. acc .. '])', function(a, aund, b, bund, c, d)
if perm_syl_onset[a .. b .. c] or c == 'j' and strfind(b, '[čǰɕӂʲ]') then
return '@' .. a .. aund .. b .. bund .. c .. d
elseif perm_syl_onset[b .. c] then
return a .. aund .. '@' .. b .. bund .. c .. d
end end)
--6. if / is present (explicit syllable boundary), remove any @
-- (automatic boundary) and convert / to @
if strfind(pron, '/') then
pron = strsub(pron, '[^' .. vow .. acc .. ']+', function(x)
if strfind(x, '/') then
x = strsub(x, '@', '')
x = strsub(x, '/', '@')
end
return x
end)
end
--7. remove @ followed by a final consonant cluster
pron = strsub(pron, '@([^‿⁀@' .. vow .. ']+⁀)$', '%1')
--8. remove @ preceded by an initial consonant cluster (should only
-- happen when / is inserted by user or in цз, чж sequences)
pron = strsub(pron, '^(⁀[^‿⁀@' .. vow .. ']+)@', '%1')
--9. make sure @ isn't directly before linking ‿⁀
pron = strsub(pron, '@([‿⁀]+)', '%1@')
-- handle word-initial unstressed o and a; note, vowels always
-- followed by at least one char because of word-final ⁀
-- do after syllabification because syllabification doesn't know
-- about ɐ as a vowel
pron = strsub(pron, '^⁀[ao]([^' .. acc .. '])', '⁀ɐ%1')
--split by syllable
local syllable = strsplit(pron, '@', true)
--create set of 1-based syllable indexes of stressed syllables
--(acute, grave, circumflex)
local stress = {}
for j = 1, #syllable do
if strfind(syllable[j], stress_accents) then
stress[j] = "real"
elseif strfind(syllable[j], CFLEX) then
stress[j] = "cflex"
end
end
-- iterate syllable by syllable to handle stress marks, vowel allophony
local syl_conv = {}
for j = 1, #syllable do
local syl = syllable[j]
local alnum
--vowel allophony
if stress[j] then
-- convert acute/grave/circumflex accent to appropriate
-- IPA marker of primary/secondary/unmarked stress
alnum = 1
syl = strsub(syl, '(.*)́', 'ˈ%1')
syl = strsub(syl, '(.*)̀', 'ˌ%1')
syl = strsub(syl, CFLEX, '')
elseif stress[j+1] == "real" then
-- special-casing written а immediately before the stress,
-- but only for primary/secondary stress, not circumflex
alnum = 2
else
alnum = 3
end
syl = strsub(syl, vowels_c, function(a)
if a ~= '' then
return allophones[a][alnum]
end end)
syl_conv[j] = syl
end
pron = concat(syl_conv, "")
-- Optional (j) before ɪ, which is always unstressed; not following
-- consonant across a joined word boundary
pron = strsub(pron, '([^' .. ipa_vow .. ']⁀‿⁀)jɪ', '%1' .. TEMPSUB .. 'ɪ')
pron = strsub(pron, '⁀jɪ', '⁀(j)ɪ')
pron = strsub(pron, '([' .. ipa_vow .. '])jɪ', "%1(j)ɪ")
pron = strsub(pron, TEMPSUB, 'j')
--consonant assimilative palatalization of tn/dn/sn/zn, depending on
--whether [rl] precedes
pron = strsub(pron, '([rl]?)([ː()ˈˌ]*[dtsz])([ː()ˈˌ]*nʲ)', function(a, b, c)
if a == '' then
return a .. b .. 'ʲ' .. c
else
return a .. b .. '⁽ʲ⁾' .. c
end end)
--consonant assimilative palatalization of st/zd, depending on
--whether [rl] precedes
pron = strsub(pron, '([rl]?)([ˈˌ]?[sz])([ː()ˈˌ]*[td]ʲ)', function(a, b, c)
if a == '' then
return a .. b .. 'ʲ' .. c
else
return a .. b .. '⁽ʲ⁾' .. c
end end)
--general consonant assimilative palatalization
pron = strsubrep(pron, '([szntdpbmfcĵx])([ː()ˈˌ]*)([szntdpbmfcĵlk]ʲ)', function(a, b, c)
if cons_assim_palatal['compulsory'][a..c] then
return a .. 'ʲ' .. b .. c
elseif cons_assim_palatal['optional'][a..c] then
return a .. '⁽ʲ⁾' .. b .. c
else
return a .. b .. c
end end)
-- further assimilation before alveolopalatals
pron = strsub(pron, 'n([ː()ˈˌ]*)([čǰɕӂ])', 'nʲ%1%2')
-- optional palatal assimilation of вп, вб only word-initially
pron = strsub(pron, '⁀([ː()ˈˌ]*[fv])([ː()ˈˌ]*[pb]ʲ)', '⁀%1⁽ʲ⁾%2')
-- optional palatal assimilation of бв but not in обв-
pron = strsub(pron, 'b([ː()ˈˌ]*vʲ)', 'b⁽ʲ⁾%1')
if strfind(word[i], '⁀o' .. accents .. '?bv') then
-- ə in case of a word with a preceding preposition
pron = strsub(pron, '⁀([ː()ˈˌ]*[ɐəo][ː()ˈˌ]*)b⁽ʲ⁾([ː()ˈˌ]*vʲ)', '⁀%1b%2')
end
-- palatalized labials before /j/ should be optionally palatalized
pron = strsub(pron, '([mpbfv])ʲ([ːˈˌ]*j)', '%1⁽ʲ⁾%2')
-- Word-final -лся (normally in past verb forms) should have optional
-- palatalization. Need to rewrite as -лсьа to defeat this.
-- FIXME: Should we move this to phonetic_subs?
if strfind(word[i], 'ls[äạ]⁀') then
pron = strsub(pron, 'lsʲə⁀', 'ls⁽ʲ⁾ə⁀')
end
word[i] = pron
end
text = concat(word, " ")
-- Front a and u between soft consonants. If between a soft and
-- optionally soft consonant (should only occur in that order, shouldn't
-- ever have a or u preceded by optionally soft consonant),
-- split the result into two. We only split into two even if there
-- happen to be multiple optionally fronted a's and u's to avoid
-- excessive numbers of possibilities (and it simplifies the code).
-- 1. First, temporarily add soft symbol to inherently soft consonants.
text = strsub(text, '([čǰɕӂj])', '%1ʲ')
-- 2. Handle case of [au] between two soft consonants
text = strsubrep(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?.ʲ)', function(a, b, c)
return a .. fronting[b] .. c end)
-- 3. Handle [au] between soft consonant and optional j, which is still fronted
text = strsubrep(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?%(jʲ%))', function(a, b, c)
return a .. fronting[b] .. c end)
-- 4. Handle case of [au] between soft and optionally soft consonant
if strfind(text, 'ʲ[ː()]*[auʊ][ˈˌ]?.⁽ʲ⁾') then
local opt_hard = strsub(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?.)⁽ʲ⁾', '%1%2%3')
local opt_soft = strsub(text, '(ʲ[ː()]*)([auʊ])([ˈˌ]?.)⁽ʲ⁾', function(a, b, c)
return a .. fronting[b] .. c .. 'ʲ' end)
text = { opt_hard, opt_soft }
else
text = { text }
end
for i, pronunciation in ipairs(text) do
-- 5. Undo addition of soft symbol to inherently soft consonants.
pronunciation = strsub(pronunciation, '([čǰɕӂj])ʲ', '%1')
-- convert special symbols to IPA
pronunciation = strsub(pronunciation, '[cĵ]ʲ', translit_conv_j)
pronunciation = strsub(pronunciation, '[cčgĉĝĵǰšžɕӂ]', translit_conv)
-- Assimilation involving hiatus of ɐ and ə
pronunciation = strsub(pronunciation, 'ə([‿⁀]*)[ɐə]', 'ɐ%1ɐ')
-- Use ɫ for dark l
pronunciation = strsub(pronunciation, 'l([^ʲ])', 'ɫ%1')
-- eliminate ⁀ symbol at word boundaries
-- eliminate _ symbol that prevents assimilations
-- eliminate pseudoconsonant at beginning of suffixes or end of prefixes
text[i] = strsub(pronunciation, '[⁀_' .. PSEUDOCONS ..']', '')
end
return text
end
-- Normalize a (Cyrillic) respelling before it is converted to IPA.
--
-- The text is lowercased, ASCII accent shortcuts are turned into combining
-- accents, the result is transliterated, punctuation is turned into "|"
-- foot-boundary tokens, single-syllable words receive a stress mark where
-- appropriate, and accentless particles/prepositions are joined to their
-- neighbours with ‿. Finally every word is wrapped in ⁀ markers.
--
-- Parameters:
--   t    : the text to normalize (raises an error if it contains Latin letters)
--   gem_ : list of per-word gemination specs, indexed by word
--   pos_ : list of per-word part-of-speech specs, indexed by word
-- Returns: the normalized text, the adjusted gemination list and the adjusted
-- part-of-speech list.
local function normalizar(t, gem_, pos_)
	t = strlower(t)
	if strfind(t, "[a-zščžáéíóúýàèìòùỳâêîôûŷạẹịọụỵȧėȯẏ]") then
		error("El título o la ayuda deben estar en CIRILICO!")
	end
	-- disabled tracking hooks, kept for reference
	--[[
	if strfind(t, "[сз]ч") then
		track("?")
	end
	if strfind(t, "[шж]ч") then
		track("??")
	end
	if strfind(t, CFLEX) then
		track("circun")
	end
	if strfind(t, DUBGR) then
		track("doble tilde?")
	end
	]]--
	-- ASCII shortcuts for the combining accents; note that a double grave
	-- ends up as a circumflex
	t = strsub(t, "``", DUBGR)
	t = strsub(t, "`", GR)
	t = strsub(t, "@", DOTABOVE)
	t = strsub(t, "%^", CFLEX)
	t = strsub(t, DUBGR, CFLEX)
	-- translit doesn't always convert э to ɛ (depends on whether a consonant
	-- precedes), so do it ourselves before translit
	t = strsub(t, 'э', 'ɛ')
	-- vowel + йе should have double jj, but the translit module will translit
	-- it the same as vowel + е, so do it ourselves before translit
	t = strsub(t, '([' .. vowel .. ']' .. opt_accent .. ')й([еѐ])',
		'%1йй%2')
	-- transliterate and decompose Latin vowels with accents, recomposing
	-- certain key combinations; don't include accent on monosyllabic ё, so
	-- that we end up without an accent on such words. NOTE: Not clear we
	-- need to be decomposing like this any more, although it is still
	-- useful if the user supplies Latin text, which we allow (although
	-- undocumented).
	t = decompose(m_ru_translit.tr_after_fixes(t))
	-- handle old ě (e.g. сѣдло́), ǒ (e.g. сѣ̈дла) and ǫ (e.g. ея̈)
	t = t:gsub("ě", "e")
		:gsub("ǒ", "o")
		:gsub("ǫ", "o")
	-- handle sequences of accents (esp from ё with secondary/tertiary stress)
	t = strsub(t, accents .. '+(' .. accents .. ')', '%1')
	-- anything that delimits a fragment becomes an IPA foot boundary "|"
	t = strsubrep(t, PUNTUACION, " | ")
	-- drop whatever punctuation is still left
	t = strsubrep(t, PUNTUACION_EXTRA, "")
	-- a prefix or suffix (leading/trailing hyphen) needs a pseudoconsonant in
	-- place of the hyphen so that its pronunciation is generated correctly
	t = strsub(t, '^%s*[%-‐]', PSEUDOCONS)
	t = strsub(t, '[%-‐]%s*$', PSEUDOCONS)
	-- remaining hyphens become spaces
	t = strsubrep(t, "[%-‐]", " ")
	-- finally, collapse duplicated bars and surplus whitespace
	t = strsubrep(t, "%s*|%s*|%s*", " | ")
	t = strsubrep(t, "%s+", " ")
	t = strstrip(t, "[%s|]+")
	-- build gem/pos lists aligned with the tokens of t: a "|" token gets the
	-- fixed values "o"/"def", every other token consumes the next caller spec
	local gem, pos = {}, {}
	local k = 1
	for p in strmatchit(t, "[^%s]+") do
		if p == "|" then
			insert(gem, "o")
			insert(pos, "def")
		else
			insert(gem, gem_[k])
			insert(pos, pos_[k])
			k = k + 1
		end
	end
	-- Add primary stress to single-syllable words preceded or followed by
	-- unstressed particle or preposition. Add "tertiary" stress to remaining
	-- single-syllable words that aren't a particle, preposition, prefix or
	-- suffix and don't already bear an accent (including force-reduction
	-- accents, i.e. dot-above/dot-below); "tertiary stress" means a vowel is
	-- treated as stressed for the purposes of vowel reduction but isn't
	-- marked with a primary or secondary stress marker; we repurpose a
	-- circumflex for this purpose.
	-- NOTE(review): every hyphen has been replaced above (by PSEUDOCONS or a
	-- space), so the hyphen-specific checks below look unreachable — confirm.
	local word = strsplit(t, "([ %-]+)")
	for i = 1, #word do
		-- check for single-syllable words that need a stress; they must meet
		-- the following conditions:
		-- 0. must not be a foot-boundary token. This has to be written with
		--    ~= : `not word[i] == "|"` parses as `(not word[i]) == "|"`,
		--    which is always false and would disable this whole branch.
		-- 1. must not be an accentless word, which is any of the following:
		-- 1a. in the "pre" class, or
		if word[i] ~= "|" and not (accentless['pre'][word[i]] or
			-- 1b. in the "prespace" class if followed by space and another word, or
			i < #word - 1 and accentless['prespace'][word[i]] and word[i+1] == " " or
			-- 1c. in the "post" class if preceded by another word and
			-- not followed by a hyphen (this is because words like
			-- ка and же are also used for spelling initialisms), or
			i > 2 and accentless['post'][word[i]] and word[i+1] ~= "-" or
			-- 1d. in the "posthyphen" class preceded by a hyphen and another word
			-- (and not followed by a hyphen, see 1c);
			i > 2 and accentless['posthyphen'][word[i]] and word[i-1] == "-" and word[i+1] ~= "-") and
			-- 2. must be one syllable;
			strlen(strsub(word[i], '[^' .. vow .. ']', '')) == 1 and
			-- 3. must not have any accents (including dot-above, forcing reduction);
			not strfind(word[i], accents) and
			-- 4. must not be a prefix or suffix, identified by a preceding or trailing hyphen, i.e. one of the following:
			-- 4a. utterance-initial preceded by a hyphen, or
			not (i == 3 and word[2] == "-" and word[1] == "" or
			-- 4b. non-utterance-initial preceded by a hyphen, or
			i >= 3 and word[i-1] == " -" or
			-- 4c. utterance-final followed by a hyphen, or
			i == #word - 2 and word[i+1] == "-" and word[i+2] == "" or
			-- 4d. non-utterance-final followed by a hyphen;
			i <= #word - 2 and word[i+1] == "- ") then
			-- OK, we have a stressable single-syllable word; either add primary
			-- or tertiary stress:
			-- 1. add primary stress if preceded or followed by an accentless word,
			if (i > 2 and accentless['pre'][word[i-2]] or
				i > 2 and word[i-1] == " " and accentless['prespace'][word[i-2]] or
				i < #word - 1 and accentless['post'][word[i+2]] and word[i+3] ~= "-" or
				i < #word - 1 and word[i+1] == "-" and accentless['posthyphen'][word[i+2]] and word[i+3] ~= "-") then
				word[i] = strsub(word[i], vowels_c, '%1' .. AC)
			-- 2. else add tertiary stress
			else
				word[i] = strsub(word[i], vowels_c, '%1' .. CFLEX)
			end
		end
	end
	-- make unaccented prepositions and particles liaise with the following or
	-- preceding word; in the process, fix up number of elements in gem/pos
	-- tables so there's a single element for the combined word
	-- NOTE(review): gem/pos received one entry per "|" token above, while
	-- real_word_index skips "|" tokens — confirm the indexes still line up
	-- when a foot boundary precedes a liaising word.
	local real_word_index = 0
	for i = 1, #word do
		-- odd positions hold words, even positions hold separators
		if (i % 2) == 1 and word[i] ~= "|" then
			real_word_index = real_word_index + 1
		end
		if i < #word - 1 and (accentless['pre'][word[i]] or accentless['prespace'][word[i]] and word[i+1] == " ") and
			-- don't add ‿ onto the end of a prefix; a prefix is a word followed by a hyphen that is in turn
			-- followed by a space or end of terms; note that ends of terms after a hyphen are marked by a blank
			-- string due to the way the capturing split works
			not (word[i+1] == "-" and (word[i+2] == " " or word[i+2] == "" and i == #word - 2)) then
			word[i+1] = '‿'
			remove(gem, real_word_index)
			remove(pos, real_word_index)
		elseif i > 2 and (accentless['post'][word[i]] and word[i+1] ~= "-" or
			accentless['posthyphen'][word[i]] and word[i-1] == "-" and word[i+1] ~= "-") then
			word[i-1] = '‿'
			-- for unaccented words that liaise with the preceding word,
			-- remove the gemination spec corresponding to the unaccented word
			-- because the gemination in question is almost certainly in the
			-- preceding word, but remove the POS spec corresponding to the
			-- preceding word because it's the final -е of the unaccented word
			-- that the POS will refer to
			remove(gem, real_word_index)
			remove(pos, real_word_index - 1)
		end
	end
	t = concat(word, "")
	-- add a ⁀ at the beginning and end of every word and at close juncture
	-- boundaries; we will remove this later but it makes it easier to do
	-- word-beginning and word-end substitutions
	t = strsub(t, ' ', '⁀ ⁀')
	t = strstrip(t)
	t = '⁀' .. t .. '⁀'
	t = strsub(t, '‿', '⁀‿⁀')
	return t, gem, pos
end
-- Generate the pronunciation(s) of a respelling.
--
-- Parameters:
--   text           : the respelling
--   adj            : passed through unchanged to ru_ipa_main()
--   gem, pos       : per-word gemination / part-of-speech lists, handed to
--                    normalizar() and then to ru_ipa_main()
--   zhpal          : 'n' to never generate the palatalized ӂӂ variant of
--                    zž/žž, 'y' to skip the prefix-boundary check, anything
--                    else for the default behaviour. A list of such flags is
--                    also accepted; its last entry is used.
--   is_transformed : if false/nil, text is first run through
--                    m_ru_translit.apply_tr_fixes()
-- Returns: a one-element table whose only element is the list of
-- HTML-encoded pronunciations.
local function generar_pron(text, adj, gem, pos, zhpal, is_transformed)
	-- Callers may collect the zhpal flags in a list; the comparisons below
	-- need a plain string (a table never equals 'n' or 'y'), so unwrap it.
	if type(zhpal) == "table" then
		zhpal = zhpal[#zhpal]
	end
	if not is_transformed then
		local _, transformed_text = m_ru_translit.apply_tr_fixes(text)
		text = transformed_text
	end
	text, gem, pos = normalizar(text, gem, pos)
	-- At this point, the spelling has been normalized. Now we need to handle
	-- any pronunciation-spelling variants (particularly, handling зж and жж,
	-- which have both non-palatalized and palatalized variants except at
	-- prefix boundaries) and convert each variant to IPA.
	local alltext
	-- If zž or žž occur not at a prefix boundary, then generate two variants,
	-- the first with non-palatal [ʐː] and the second with [ʑː] (potentially
	-- with nearby vowels affected appropriately for the palatalization
	-- difference). But don't do this if zhpal=n.
	if zhpal == 'n' or not strfind(text, 'ž') then
		-- speed up the majority of cases where ž doesn't occur
		alltext = {text}
	else
		-- First, go through and mark all prefix boundaries where a ž directly
		-- follows the prefix by inserting a ˑ between prefix and ž. This
		-- prevents us from generating the [ʑː] variant (notated internally as
		-- ӂӂ). Don't do this if zhpal=y, which defeats this check.
		if zhpal ~= 'y' then
			for _, gempref in ipairs(geminate_pref) do
				local origspell = gempref[2]
				local is_zh = gempref[3]
				if is_zh then
					-- allow all vowels to have accents following them
					origspell = strsub(origspell, vowels_c, '%1' .. accents .. '?')
					text = strsub(text, '(⁀' .. origspell .. ')ž', '%1ˑž')
					text = strsub(text, '(⁀po' .. origspell .. ')ž', '%1ˑž')
					text = strsub(text, '(⁀ne' .. origspell .. ')ž', '%1ˑž')
					text = strsub(text, '(⁀nepo' .. origspell .. ')ž', '%1ˑž')
				end
			end
		end
		-- Then, if zž or žž are present (which will exclude prefix boundaries
		-- because a ˑ marker will intervene), generate the two possibilities,
		-- else generate only one.
		local alltext1
		if strfind(text, '[zž]ž') then
			alltext1 = {text, strsub(text, '[zž]ž', 'ӂӂ')}
		else
			alltext1 = {text}
		end
		-- Finally, remove the ˑ marker.
		alltext = {}
		for _, variant in ipairs(alltext1) do
			insert(alltext, strsub(variant, 'ˑ', ''))
		end
	end
	-- Now generate the pronunciation(s) for each of the spelling variants
	-- we generate above. (In some cases there are multiple pronunciation
	-- variants generated, e.g. in the sequence palatalized consonant + a/u +
	-- optionally palatalized consonant.)
	local allpron = {}
	for _, t in ipairs(alltext) do
		local thispron = ru_ipa_main(t, adj, gem, pos)
		for _, pron in ipairs(thispron) do
			insert(allpron, strhtml(pron))
		end
	end
	return {allpron}
end
-- Fill in the pronunciation-related fields of `args` for the page `titulo`.
--
-- If no respelling ("ayuda") was supplied, the title is used as the first
-- one. When neither "fone" nor "fono" was supplied, every respelling (up to
-- nine resulting pronunciation groups) is transliterated into args["tl"] and
-- converted to IPA, which is appended to args["fone"]. The flags in
-- args["ayudaextra"][j] (separated by ";") tune the conversion of the j-th
-- respelling. Lastly args["rima"][1] is derived from the last word of the
-- first pronunciation.
--
-- Parameters:
--   titulo : page title
--   args   : table of argument lists; modified in place
-- Returns: the same `args` table.
function export.procesar_pron_args(titulo, args)
	local tit = titulo
	local vino_ayuda
	if #args["ayuda"] < 1 then
		args["ayuda"][1] = tit
	else
		vino_ayuda = true
	end
	if #args["fone"] < 1 and #args["fono"] < 1 then
		local x = pron_abc[args["ayuda"][1]]
		if x then
			-- Use independent copies: args["tl"][j] is overwritten with the
			-- transliteration below, and that must neither clobber
			-- args["ayuda"] nor modify the shared pron_abc entry.
			local ayuda, tl = {}, {}
			for idx, nombre in ipairs(x) do
				ayuda[idx] = nombre
				tl[idx] = nombre
			end
			args["ayuda"] = ayuda
			args["tl"] = tl
		end
		local A = #args["ayuda"]
		local j = 1 -- index of the current respelling
		local k = 1 -- number of pronunciations inserted so far (at most 9)
		while k <= 9 and j <= A do
			local gem, cg, adjno, shtono = {}, {}, false, false
			-- generar_pron() compares zhpal with the strings 'n'/'y', so keep
			-- it as a plain string (the last flag given wins)
			local zhpal
			local flags = args["ayudaextra"][j] and strsplit(args["ayudaextra"][j], ";") or {}
			for _, flag in ipairs(flags) do
				if flag == "gemsí" or flag == "gemsi" or flag == "gems" then
					insert(gem, "y")
				elseif flag == "gemno" or flag == "gemn" then
					insert(gem, "n")
				elseif flag == "gemop" then
					insert(gem, "o")
				elseif flag == "zhpalno" or flag == "zhpaln" then
					zhpal = "n"
				elseif flag == "zhpalsí" or flag == "zhpalsi" or flag == "zhpals" then
					zhpal = "y"
				elseif flag == "adjno" or flag == "adjn" then
					adjno = true
				elseif flag == "shtono" or flag == "shton" then
					shtono = true
				elseif pos_properties[flag] then
					-- any other recognised flag is a part-of-speech spec
					insert(cg, flag)
				end
			end
			local origtext, transformed_text = m_ru_translit.apply_tr_fixes(args["ayuda"][j],
				adjno, shtono)
			args["tl"][j] = m_ru_translit.tr_after_fixes(transformed_text)
			if vino_ayuda then
				args["fgraf"][j] = {origtext}
			end
			local fone = generar_pron(transformed_text, nil, gem, cg, zhpal, true)
			for i, _ in ipairs(fone) do
				insert(args["fone"], fone[i])
				k = k + 1
				if k > 9 then
					break
				end
			end
			j = j + 1
		end
	end
	if args["fone"][1] and args["fone"][1][1] then
		-- keep only the last word
		local rim = strsub(args["fone"][1][1], ".*%s([^%s]+)$", "%1")
		-- keep what follows the last primary stress mark
		rim = strsub(rim, "^.*ˈ(.-)$", "%1")
		-- the rhyme starts at the first vowel of that stretch
		args["rima"][1] = strsub(rim, ".-".."(["..ipa_vow.."].*"..")".."$", "%1")
	end
	return args
end
-- Module result: the table holding the public function procesar_pron_args.
return export