Main | Documentation | Draft | Usage |
This module implements {{Ko-rm}}. Some of its data is stored in Module:Ko-rm/data.
-- based on Wiktionary [[wikt:Module:ko-translit]] [[wikt:Module:ko-pron]] [[wikt:Module:ko-pron/data]] [[wikt:Module:ko]]
local p = {}
local Ugsub = mw.ustring.gsub
local Umatch = mw.ustring.match
local Usub = mw.ustring.sub
local Uchar = mw.ustring.char
local codepoint = mw.ustring.codepoint
local data = mw.loadData('Module:Ko-rm/data')
local lib = require('Module:Feature')
local CleanTT = require('Module:Tt').CleanTT
--- Template entry point for {{Ko-rm}}.
-- Collects the invocation arguments through Module:Arguments (parent frame
-- first, with Template:Ko-rm registered as a wrapper) and hands them to
-- `p._main`.
-- @param frame the frame object supplied by #invoke
-- @return the result of `p._main`, or '' when no first argument is present
function p.main(frame)
	local getArgs = require('Module:Arguments').getArgs
	local args = getArgs(frame, {
		parentFirst = true,
		wrappers = {'Template:Ko-rm'}
	})
	if args[1] then
		return p._main(args)
	end
	return ''
end
--- Romanizes the Korean text in `args[1]` and tidies up the result.
-- Steps: clean the input (strip markers, hanja glosses and helper markup),
-- apply a few spelling substitutions, run `p.romanize`, then fix
-- capitalization, punctuation and the `//…//` secondary-romanization segments.
-- @param args argument table; `args[1]` is the text, `args.capi` / `args.sent`
--   request capitalization, and the whole table is forwarded to `p.romanize`
-- @return the romanized string, or '' when the cleaned input contains no
--   Hangul syllable or `p.romanize` returns nil
function p._main(args)
	local str = CleanTT(mw.text.unstrip(args[1]))
	-- mw.logObject('input' .. args[1]) --debug
	-- mw.logObject('cleaned' .. str) --debug

	-- remove parenthesised runs made only of CJK ideographs (or hyphens)
	str = Ugsub(str, '%([一-鿿㐀-䶿𠀀-𰀀-]+%)', '')
	-- same, when part of the run is wrapped in ''' (bold) markers
	str = Ugsub(str, "%([一-鿿㐀-䶿𠀀-𰀀-]*'''[一-鿿㐀-䶿𠀀-𰀀-]+'''[一-鿿㐀-䶿𠀀-𰀀-]*%)", '')
	-- ideograph run followed by a parenthesised text: keep only that text
	str = Ugsub(str, '[一-鿿㐀-䶿𠀀-𰀀-]+%((.-)%)', '%1')
	-- drop <sup> elements containing ※
	str = Ugsub(str, '<sup.-※.-</sup>', '')
	-- unwrap <span … title …> elements, keeping their content
	str = Ugsub(str, '<span.-title.->(.-)</span>', '%1')
	str = Ugsub(str, '<[%w%p]+:(.-)>', '%1') --for manual readings
	str = Ugsub(str, '<%->', '-') --for manual hyphenation
	str = Ugsub(str, '< >', ' ') --for manual spacing

	-- nothing to romanize without at least one precomposed Hangul syllable
	if not Umatch(str, '[가-힣]') then
		return ''
	end

	--pronunciation exception(s)
	str = Ugsub(str, '여덟', '여덜')
	str = Ugsub(str, 'Ⅰ', '일')
	str = Ugsub(str, 'Ⅱ', '이')
	str = Ugsub(str, 'Ⅲ', '삼')
	str = Ugsub(str, 'Ⅳ', '사')
	str = Ugsub(str, 'Ⅴ', '오')
	str = Ugsub(str, 'Ⅵ', '육')
	str = Ugsub(str, 'Ⅶ', '칠')
	str = Ugsub(str, 'Ⅷ', '팔')
	str = Ugsub(str, 'Ⅸ', '구')
	str = Ugsub(str, 'Ⅹ', '십')

	-- pre-romanization punctuation conversion
	str = Ugsub(str, '[《「『【]', '“')
	str = Ugsub(str, '[》」』】]', '”')

	local revised = p.romanize(str, args)
	if (not revised) then
		return ''
	end
	--mw.logObject(revised,'revised') --debug

	-- when sentence punctuation is present: upper-case the first character and
	-- mark (with ^) the letter or apostrophe that follows ". ", "? " or "! "
	if Umatch(revised, '[%.%?%!]') then
		revised = mw.ustring.upper(Usub(revised, 1, 1)) .. Usub(revised, 2, -1)
		revised = Ugsub(revised, "([%.%?%!]) ([a-z%'])", '%1 ^%2')
		revised = Ugsub(revised, "^%'%'%'", "'''^")
	end
	revised = Ugsub(revised, "([a-z])%-%'([a-z])", '%1-%2')
	-- move a ^ marker past a following ''' so it sits next to the letter
	revised = Ugsub(revised, "%^%'%'%'", "'''^")
	-- ^ followed by a lowercase letter: upper-case it, then drop all markers
	revised = Ugsub(revised, '%^%l', mw.ustring.upper)
	revised = Ugsub(revised, '%^', '')
	revised = Ugsub(revised, "%-'''%-", "'''-")
	revised = Ugsub(revised, '%-%-', '-')

	--punctuation fixing
	revised = Ugsub(revised, '…', '...')
	-- NOTE(review): the next two substitutions replace a character with itself
	-- as written; possibly full-width forms were intended — confirm.
	revised = Ugsub(revised, '!', '!')
	revised = Ugsub(revised, '?', '?')
	revised = Ugsub(revised, '”([A-Za-z])', '”-%1')
	revised = Ugsub(revised, '(//[^/@]-@@[^/@]-@@//)%-?([A-Za-z])', '%1-%2')
	revised = Ugsub(revised, '[·・]', ' - ')
	revised = Ugsub(revised, '——', '⸺')
	-- NOTE(review): identical to the previous line as written — confirm whether
	-- a different dash pair was intended.
	revised = Ugsub(revised, '——', '⸺')

	--secondary romanisation system
	-- Resolve `//…//` segments one per iteration. A segment of the form
	-- `//A@@B@@//` becomes A, with B shown as a "Spelled:" tooltip when the two
	-- differ (case-insensitively); a plain `//A//` segment just loses its slashes.
	while revised:find('^.-//[^/]-//.-$') do
		local pre, dur, dur3, post = string.match(revised, '^(.-)//([^@/]-)@@([^@/]-)@@//(.-)$')
		if dur3 ~= nil then
			--mw.logObject(pre,'pre') mw.logObject(dur,'dur') mw.logObject(post,'post') --debug
			if mw.ustring.lower(dur3) ~= mw.ustring.lower(dur) then
				-- escape double quotes so they cannot terminate the title attribute
				local spelled = p.capitalizer(dur3, true):gsub('"', '&quot;')
				dur = '<span style="border-bottom-width:1px; border-bottom-style:dotted; border-bottom-color:rgb(128, 128, 128); cursor:help;" title="Spelled: ' .. spelled .. '">' .. p.capitalizer(dur, true) .. '</span>'
			else
				dur = p.capitalizer(dur, true)
			end
		else
			pre, dur, post = string.match(revised, '^(.-)//([^/]-)//(.-)$')
		end
		revised = pre .. dur .. post
	end

	--all case (|capi=1) or sentence case (|sent=1)
	if (args.capi or args.sent) then
		revised = p.capitalizer(revised, (args.capi or nil))
	end

	--post-capitalization punctuation fixing
	revised = Ugsub(revised, "”'", '”-')
	revised = Ugsub(revised, '[“”]', '"')
	-- separate letters from adjacent digit runs with a hyphen
	revised = Ugsub(revised, '([%a])(%d+)', '%1-%2')
	revised = Ugsub(revised, '(%d+)([%a])', '%1-%2')
	revised = Ugsub(revised, '(%d+)-[Pp]x', '%1px') --lazy fix for accidental hyphenation of pixel amounts
	return revised
end
--- Romanizes every Hangul run found in `text_param`.
-- Each run matched by the word pattern is split into characters, decomposed
-- into jamo, and romanized junction by junction using `data.vowels`,
-- `data.FSC` and `data.boundary`. The romanization then replaces the first
-- occurrence of the run in the text.
-- @param text_param string to romanize
-- @param args argument table; the optional comma-separated lists `nn`, `ni`
--   and `bcred` are read from it and used as position-keyed switches
-- @return the text with its Hangul runs replaced, or nil when a junction has
--   no entry in `data.boundary` (a message is logged in that case)
function p.romanize(text_param, args)
	-- P[name][key] = 1 for every comma-separated entry of args[name];
	-- numeric entries are stored under number keys
	local P, optional_params = {}, { 'nn', 'ni', 'bcred' }
	for _, pm in ipairs(optional_params) do
		P[pm] = { }
		if args[pm] then
			for pp in mw.text.gsplit(args[pm], ',') do P[pm][tonumber(pp) or pp] = 1 end
		end
	end
	--mw.logObject(P,'P') --debug
	-- running position counters shared by all runs; they index into P.*
	local T_index, T_next_index = 0,0
	-- toggled by every `//` marker met while scanning; while true, the third
	-- column of data.boundary is collected into romanisation3
	local rom3 = false
	-- drop a double quote that is followed by another character
	text_param = Ugsub(text_param, '["](.)', '%1')
	for primitive_word in mw.ustring.gmatch(text_param, '[%-ᄀ-ᄒ' .. 'ᅡ-ᅵ' .. 'ᆨ-ᇂ' .. "ㄱ-ㅣ가-힣' /「」%^]+") do
		--mw.logObject(primitive_word,'primitive_word') --debug
		--mw.logObject(text_param,'text_param') --debug
		local the_original = primitive_word
		-- remember where ''' markers sat, then remove them from the run
		primitive_word = Ugsub(primitive_word, "'''", 'ß')
		local bold_position, bold_count = {}, 0
		while Umatch(primitive_word, 'ß') do
			bold_position[(mw.ustring.find(primitive_word, 'ß')) + bold_count] = true
			primitive_word = Ugsub(primitive_word, 'ß', '', 1)
			bold_count = bold_count + 1
		end
		local word_set = { primitive_word }
		local word_set_romanisations = {}
		for _, respelling in ipairs(word_set) do
			--mw.logObject(word_set,'word_set') --debug
			--mw.logObject(respelling,'respelling') --debug
			local decomposed_syllables = p.decompose_syllable(respelling)
			--mw.logObject(decomposed_syllables,'decomposed_syllables') --debug
			local romanisation = {}
			local romanisation3 = {}
			local bold_insert_count = 0
			-- index 0 is a virtual position before the first character, so the
			-- junction into the first syllable is processed as well
			for index = 0, #decomposed_syllables, 1 do
				if index ~= 0 then T_index = T_index + 1 end
				local this_syllable_text = index ~= 0 and Usub(respelling, index, index) or ''
				local forced = ''
				--mw.logObject(this_syllable_text,'this_syllable_text_I') --debug
				-- collect marker characters (/ 「 」 ^) sitting at this position;
				-- each one shortens `respelling` by one character so that
				-- position `index` lines up with decomposed_syllables again
				while Umatch(this_syllable_text, '[/「」^]') do
					forced = forced .. this_syllable_text
					respelling = Usub(respelling, 2, -1)
					this_syllable_text = index ~= 0 and Usub(respelling, index, index) or ''
				end
				--mw.logObject(forced,'forced') --debug
				if (forced:find('//') and (not rom3)) then
					rom3 = true
				elseif forced:find('//') then
					rom3 = false
				end
				--mw.logObject(this_syllable_text,'this_syllable_text_F') --debug
				if this_syllable_text == '-' then
					-- a hyphen emits nothing itself; if a secondary segment has
					-- just closed, flush it as @@…@@ before the last two entries
					if ((not rom3) and #romanisation3 > 0) then
						table.remove(romanisation3)
						table.remove(romanisation3)
						table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
						romanisation3 = {}
						table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), forced)
					end
				else
					T_next_index = T_index
					local syllable = decomposed_syllables[index] or { initial = 'Ø', vowel = 'Ø', final = 'X' }
					-- look ahead to the next non-hyphen character
					local next_index = index
					local next_syllable_text
					local saw_hyphen_after = false
					while true do
						next_index = next_index + 1
						T_next_index = T_next_index + 1
						next_syllable_text = next_index > #decomposed_syllables and '' or Usub(respelling, next_index, next_index)
						if next_syllable_text ~= '-' then
							break
						end
						saw_hyphen_after = true
					end
					-- NOTE: syllable and next_syllable are entries of
					-- decomposed_syllables and are modified in place, so a change
					-- to next_syllable.initial is seen by the next iteration
					local next_syllable = decomposed_syllables[next_index] or { initial = 'Ø', vowel = 'Ø', final = 'Ø' }
					syllable.final = data.FSC[syllable.final] or syllable.final
					-- per-syllable final overrides that depend on what follows
					if this_syllable_text == '넓' then
						if Umatch(next_syllable.initial, '[ᄌᄉ]') then
							syllable.final = 'ᆸ'
						elseif next_syllable.initial == 'ᄃ' then
							if Umatch(next_syllable.vowel, '[^ᅡᅵ]') then
								syllable.final = 'ᆸ'
							end
						end
					end
					local vowel = data.vowels[syllable.vowel][2]
					-- |nn=: at the listed positions an ᆫ + ᄅ junction gets initial ᄂ
					if P.nn[T_next_index] and Umatch(syllable.final .. next_syllable.initial, 'ᆫᄅ') then
						next_syllable.initial = 'ᄂ'
					end
					-- |ni=: at the listed positions ᄋ before ᅵ/ᅣ/ᅧ/ᅭ/ᅲ becomes ᄂ
					if P.ni[T_next_index] and next_syllable.initial == 'ᄋ' and Umatch(next_syllable.vowel, '[ᅵᅣᅧᅭᅲ]') then
						next_syllable.initial = 'ᄂ'
					end
					-- |bcred=: at the listed positions replace the final with the
					-- first column of its '<final>-Ø' boundary entry
					if P.bcred[T_index] then
						syllable.final = data.boundary[syllable.final .. '-Ø'][1]
					end
					if index ~= 0 and this_syllable_text == '밟' and not
						Umatch(next_syllable.initial, '[ᄋᄒ]') then
						syllable.final = 'ᆸ'
					end
					if Umatch(this_syllable_text, '[닭뷁삵슭앍줅찱칡탉흙]') and not
						Umatch(next_syllable.initial .. ';' .. next_syllable.vowel, 'ᄋ;[ᅦᅧᅳᅴᅵ]') then
						syllable.final = 'ᆨ'
					end
					-- before 없 the final is reduced to a single consonant
					if next_syllable_text == '없' then
						if Umatch(syllable.final, '[ᆩᆪᆰᆿ]') then
							syllable.final = 'ᆨ'
						elseif Umatch(syllable.final, '[ᆬᆭ]') then
							syllable.final = 'ᆫ'
						elseif Umatch(syllable.final, '[ᆺᆻᆽᆾᇀ]') then
							syllable.final = 'ᆮ'
						elseif Umatch(syllable.final, '[ᆲᆳᆴᆶ]') then
							syllable.final = 'ᆯ'
						elseif syllable.final == 'ᆱ' then
							syllable.final = 'ᆷ'
						elseif Umatch(syllable.final, '[ᆵᆹᇁ]') then
							syllable.final = 'ᆸ'
						end
					end
					-- final/initial adjustments before ᅵ or ᅧ, skipped when
					-- |bcred= applies to this position
					if (not P.bcred[T_index]) then
						if Umatch(syllable.final .. next_syllable.initial, 'ᇀᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆾ'
							end
						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆴᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆯ'
								next_syllable.initial = 'ᄎ'
							end
						-- (a comparison against the never-assigned global
						-- `s_variation` used to be here; it was always true)
						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆮᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆽ'
							end
						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆮᄒ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆾ'
								next_syllable.initial = 'ᄋ'
							end
						end
					end
					if syllable.final .. next_syllable.initial == 'ᆺᄋ' and not
						Umatch(next_syllable_text, '[아았어었에으은을음읍의이인일임입있]') then
						syllable.final = 'ᆮ'
					end
					-- look the junction up; give up on the whole text if unknown
					local bound = syllable.final .. '-' .. next_syllable.initial
					if (not data.boundary[bound]) then
						mw.log('No boundary data for ' .. bound .. '.')
						return nil
					end
					local junction = data.boundary[bound][2]
					local junction3 = data.boundary[bound][3] or data.boundary[bound][2]
					--mw.logObject(junction, 'junction') --debug
					--mw.logObject(junction3, 'junction3') --debug
					-- re-insert a ''' marker recorded for this position, either
					-- before or after the final-consonant half of the junction
					if bold_position[index + bold_insert_count + 1] then
						junction = Ugsub(junction, '^.*$', function(matched)
							local a, b = string.match(matched, '^(ng);(.*)$')
							if ((not a) and (not b)) then a, b = string.match(matched, '^(.?%-?);(.*)$') end
							return Umatch(syllable.final .. next_syllable.initial, '^Ø?[ᄀ-ᄒ]$')
								and "'''" .. (a or '') .. ';' .. (b or '')
								or (a or '') .. "'''" .. ';' .. (b or '') end)
						bold_insert_count = bold_insert_count + 1
					end
					-- junction strings have the form "<final part>;<initial part>"
					local final_cons, initial_cons = Umatch(junction, '^(.*);(.*)$')
					--special romanisation
					if rom3 then
						-- seed the secondary buffer with the most recent entry
						if (#romanisation3 == 0 and #romanisation > 0) then
							table.insert(romanisation3, romanisation[#romanisation])
						end
						local final_cons3, initial_cons3 = Umatch(junction3, '^(.*);(.*)$')
						table.insert(romanisation3, vowel)
						table.insert(romanisation3, final_cons3)
						table.insert(romanisation3, (saw_hyphen_after and '-' or ''))
						table.insert(romanisation3, initial_cons3)
					elseif ((not rom3) and #romanisation3 > 0) then
						-- segment closed: drop its last two entries and emit the
						-- rest as @@…@@ before the last two primary entries
						table.remove(romanisation3)
						table.remove(romanisation3)
						table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
						romanisation3 = {}
					end
					table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), forced)
					table.insert(romanisation, vowel)
					table.insert(romanisation, final_cons)
					table.insert(romanisation, (saw_hyphen_after and '-' or ''))
					table.insert(romanisation, initial_cons)
					--straggler characters at end of word set
					if index == #decomposed_syllables and lib.isNotEmpty(Usub(respelling, index+1, index+1)) then
						local N = Usub(respelling, index+1, #respelling)
						if (N:find('//') and #romanisation3 > 0) then
							table.remove(romanisation3)
							table.remove(romanisation3)
							table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
							romanisation3 = {}
							table.insert(romanisation, N)
						else
							romanisation3 = {}
						end
					end
					--[[
					local currRom = {
					syllable = syllable,
					vowel = vowel,
					final_cons = final_cons,
					initial_cons = initial_cons,
					totalRom = table.concat(romanisation),
					totalRom3 = table.concat(romanisation3)
					} --debug
					mw.logObject(currRom,'currRom') --debug
					--]]
				end
			end
			local temp_romanisation = table.concat(romanisation)
			--mw.logObject(temp_romanisation,'temp_romanisation') --debug
			-- two passes: replace each … between two characters with an
			-- apostrophe when data.AI lists the pair (else nothing), then undo
			-- the apostrophe in a few fixed sequences
			for _ = 1, 2 do
				temp_romanisation = Ugsub(temp_romanisation, '(.)…(.)', function(a, b)
					return a .. (data.AI[a .. b] and "'" or '') .. b end)
				temp_romanisation = Ugsub(temp_romanisation, "wo'e", 'woe')
				temp_romanisation = Ugsub(temp_romanisation, "yo'e", 'yoe')
				temp_romanisation = Ugsub(temp_romanisation, "we'o", 'weo')
				temp_romanisation = Ugsub(temp_romanisation, "we'u", 'weu')
				temp_romanisation = Ugsub(temp_romanisation, "ye'u", 'yeu')
				temp_romanisation = Ugsub(temp_romanisation, "yu'i", 'yui')
			end
			table.insert(word_set_romanisations, temp_romanisation)
		end
		-- substitute the romanization for the first occurrence of the run
		text_param = Ugsub(
			text_param,
			p.pattern_escape(the_original),
			table.concat(word_set_romanisations, '/'),
			1
		)
	end
	return text_param
end
--- Splits one character into its initial / vowel / final jamo.
-- Precomposed Hangul syllables (가-힣) are decomposed arithmetically from
-- their code point. A lone jamo is returned in the matching slot with 'Ø' in
-- the other two. Anything else yields the placeholder record
-- `{ initial = 'Ø', vowel = ' ', final = 'X' }`.
-- @param syllable a string; callers in this module pass a single character
-- @return table with string fields `initial`, `vowel` and `final`
--   (`final` is '' for a precomposed syllable without a final consonant)
function p.decompose_jamo(syllable)
	if (not Umatch(syllable, '[가-힣]')) then
		if Umatch(syllable, '[ᄀ-ᄒ]') then
			return { initial = syllable, vowel = 'Ø', final = 'Ø' }
		elseif Umatch(syllable, '[ᅡ-ᅵ]') then
			return { initial = 'Ø', vowel = syllable, final = 'Ø' }
		elseif Umatch(syllable, '[ᆨ-ᇂ]') then
			return { initial = 'Ø', vowel = 'Ø', final = syllable }
		elseif Umatch(syllable, '[ㄱ-ㆎ]') then
			-- compatibility jamo are stored in the final slot
			return { initial = 'Ø', vowel = 'Ø', final = syllable }
		else
			return { initial = 'Ø', vowel = ' ', final = 'X' }
		end
	end
	local cp = codepoint(syllable)
	-- Defensive fallback: return the same record shape as every other branch
	-- (the previous positional table left .initial/.vowel/.final nil).
	if (not cp) then return { initial = 'Ø', vowel = ' ', final = 'X' } end

	local SYLLABLE_BASE = 0xAC00 -- code point of 가
	local FINAL_COUNT = 28       -- final slots per vowel, including "no final"
	local VOWEL_BLOCK = 588      -- syllables per initial consonant
	local relative_cp = cp - SYLLABLE_BASE
	local final_index = relative_cp % FINAL_COUNT

	-- final index 0 means "no final consonant"; finals start at U+11A8
	local jongseong = ((final_index ~= 0) and Uchar(0x11A7 + final_index)) or ''
	local jungseong = Uchar(0x1161 + math.floor((relative_cp % VOWEL_BLOCK) / FINAL_COUNT))
	local choseong = Uchar(0x1100 + math.floor(relative_cp / VOWEL_BLOCK))
	return {
		initial = choseong,
		vowel = jungseong,
		final = jongseong
	}
end
--- Escapes Lua pattern magic characters so the text can be matched literally.
-- @param text a string, or a table whose `args[1]` holds the string
-- @return the text with each of ^ $ ( ) % . [ ] * + - ? prefixed by %
function p.pattern_escape(text)
	local subject = text
	if type(subject) == 'table' then
		-- table form: take the first entry of its `args` field
		subject = subject.args[1]
	end
	local escaped = Ugsub(subject, '([%^$()%%.%[%]*+%-?])', '%%%1')
	return escaped
end
--- Decomposes a word character by character.
-- The marker characters / 「 」 ^ are skipped; every other character is passed
-- to `p.decompose_jamo` and the results are collected in order.
-- @param word string to split
-- @return array of tables returned by `p.decompose_jamo`
function p.decompose_syllable(word)
	local parts = {}
	for ch in mw.text.gsplit(word, '') do
		--mw.logObject(ch,'syllable') --debug
		local is_marker = Umatch(ch, '[/「」%^]')
		if not is_marker then
			parts[#parts + 1] = p.decompose_jamo(ch)
		end
	end
	return parts
end
--- Upper-cases the first character after each boundary in `str`.
-- A character is a boundary when it falls outside the "word" set below and is
-- not an apostrophe. With `all` set, spaces and commas are boundaries too, so
-- every word is capitalized; without it they are not, and a pending
-- capitalization simply carries across spaces. An underscore met while a
-- capitalization is pending cancels it and is removed from the output.
-- @param str string to process; returned unchanged when `lib.isNotEmpty`
--   rejects it
-- @param all any non-nil value selects the every-word mode
-- @return the capitalized string
function p.capitalizer(str, all)
	if lib.isNotEmpty(str) then
		local chars = mw.text.split(str, '')
		--mw.logObject(chars,'str') --debug
		-- `%-` is the pattern escape for a literal hyphen; the former `\-`
		-- string escape is only accepted by Lua 5.1
		local boundary = (all ~= nil) and '[^A-Za-z%-"_#&]' or '[^A-Za-z%-"_,%s#&]'
		local cap = true
		for index = 1, #chars do
			local ch = chars[index]
			if (ch:find(boundary) and ch ~= "'") or (cap and ch == ' ') then
				cap = true
				--mw.logObject(ch,'skipped') --debug
			elseif cap and ch == '_' then
				cap = false
				chars[index] = ''
			elseif cap then
				chars[index] = mw.ustring.upper(ch)
				--mw.logObject(chars[index],'capped') --debug
				cap = false
			end
		end
		str = table.concat(chars, '')
	end
	return str
end
--- Removes this module's control markup from a string.
-- @param str input text
-- @return '' when `lib.isEmpty(str)` holds, otherwise the text with the
--   markers below removed or reduced, applied in the listed order
function p.strip(str)
	if lib.isEmpty(str) then
		return ''
	end
	-- { pattern, replacement } pairs; the order is significant
	local substitutions = {
		{ '//(.-)//', '%1' },          --remove given name specifier
		{ '%^', '' },                  --remove capitalization marker
		{ '<.>', '' },                 --remove arbitrary separator
		{ '_', '' },                   --remove capitalization blacklister
		{ '<([%w%p]+):.->', '%1' },    --reduce manual readings to just the text
	}
	local result = str
	for _, rule in ipairs(substitutions) do
		result = Ugsub(result, rule[1], rule[2])
	end
	return result
end
return p