Module:data consistency check

From Wiktionary, the free dictionary
Jump to navigation Jump to search

This module checks the validity and internal consistency of the language, language family, and script data used on Wiktionary: the modules in Category:Language data modules as well as Module:scripts/data.

Output

Discrepancies detected:

  • Literary Chinese, the canonical name for the code lzh-lit, is wrong; it should be Literary Chinese.
  • The code oos and the canonical name Old Ossetic should be removed; they are not found in Module:etymology languages/data.
  • Literary Chinese, the canonical name for the code lzh-lit, is wrong; it should be Literary Chinese.
  • The code oos and the canonical name Old Ossetic should be removed; they are not found in Module:etymology languages/data.
  • Literary Chinese language (lzh-lit) has a canonical name that is not unique; it is also used by the code lzh.
  • The data key preprocess_links for ??? (th-new) is invalid.
  • The code ira-mid and the canonical name Middle Iranian should be removed; they are not found in Module:families/data.
  • The code ira-old and the canonical name Old Iranian should be removed; they are not found in Module:families/data.
  • The code ira-mid and the canonical name Middle Iranian should be removed; they are not found in Module:families/data.
  • The code ira-old and the canonical name Old Iranian should be removed; they are not found in Module:families/data.
  • Proto-Central Togo language (alv-gtm-pro) does not have the expected name "Proto-Ghana-Togo Mountain", even though it is the proto-language of the Ghana-Togo Mountain languages (alv-gtm).
  • Proto-Arawa language (auf-pro) does not have the expected name "Proto-Arauan", even though it is the proto-language of the Arauan languages (auf).
  • Proto-Amuesha-Chamicuro language (awd-amc-pro) has a proto-language code associated with the invalid code awd-amc.
  • Proto-Kampa language (awd-kmp-pro) has a proto-language code associated with the invalid code awd-kmp.
  • Proto-Arawak language (awd-pro) does not have the expected name "Proto-Arawakan", even though it is the proto-language of the Arawakan languages (awd).
  • Proto-Paresi-Waura language (awd-prw-pro) has a proto-language code associated with the invalid code awd-prw.
  • Proto-Ta-Arawak language (awd-taa-pro) does not have the expected name "Proto-Ta-Arawakan", even though it is the proto-language of the Ta-Arawakan languages (awd-taa).
  • Proto-Rukai language (dru-pro) has a proto-language code associated with Rukai (dru), which is not a family.
  • Proto-Basque language (euq-pro) does not have the expected name "Proto-Vasconic", even though it is the proto-language of the Vasconic languages (euq).
  • Proto-Norse language (gmq-pro) does not have the expected name "Proto-North Germanic", even though it is the proto-language of the North Germanic languages (gmq).
  • Proto-Kamta language (inc-krn-pro) does not have the expected name "Proto-KRNB lects", even though it is the proto-language of the KRNB lects (inc-krn).
  • Proto-Chumash language (nai-chu-pro) does not have the expected name "Proto-Chumashan", even though it is the proto-language of the Chumashan languages (nai-chu).
  • Proto-Maidun language (nai-mdu-pro) does not have the expected name "Proto-Maiduan", even though it is the proto-language of the Maiduan languages (nai-mdu).
  • Proto-Mixe-Zoque language (nai-miz-pro) does not have the expected name "Proto-Mixe-Zoquean", even though it is the proto-language of the Mixe-Zoquean languages (nai-miz).
  • Proto-Pomo language (nai-pom-pro) does not have the expected name "Proto-Pomoan", even though it is the proto-language of the Pomoan languages (nai-pom).
  • Proto-Mazatec language (omq-maz-pro) does not have the expected name "Proto-Mazatecan", even though it is the proto-language of the Mazatecan languages (omq-maz).
  • Proto-Ossetic language (os-pro) has a proto-language code associated with Ossetian (os), which is not a family.
  • Proto-North Sarawak language (poz-swa-pro) does not have the expected name "Proto-North Sarawakan", even though it is the proto-language of the North Sarawakan languages (poz-swa).
  • Proto-Salish language (sal-pro) does not have the expected name "Proto-Salishan", even though it is the proto-language of the Salishan languages (sal).
  • Proto-Samic language (smi-pro) does not have the expected name "Proto-Sami", even though it is the proto-language of the Sami languages (smi).
  • Proto-Kuki-Chin language (tbq-kuk-pro) does not have the expected name "Proto-Kukish", even though it is the proto-language of the Kukish languages (tbq-kuk).
  • Proto-Saka language (xsc-sak-pro) does not have the expected name "Proto-Sakan", even though it is the proto-language of the Sakan languages (xsc-sak).
  • Proto-Sarmatian language (xsc-sar-pro) has a proto-language code associated with the invalid code xsc-sar.
  • Blissymbols script (Blis) is not used by any language and has no characters listed for auto-detection.
  • Cypro-Minoan script (Cpmn) is not used by any language.
  • Hiragana script (Hira) is not used by any language.
  • Kana script (Hrkt) is not used by any language.
  • Image-rendered script (Image) is not used by any language and has no characters listed for auto-detection.
  • International Phonetic Alphabet script (Ipach) is not used by any language and has no characters listed for auto-detection.
  • Moon script (Moon) is not used by any language and has no characters listed for auto-detection.
  • Morse code (Morse) is not used by any language and has no characters listed for auto-detection.
  • Musical notation script (Music) is not used by any language.
  • Unspecified script (None) is not used by any language and has no characters listed for auto-detection.
  • Rongorongo script (Roro) is not used by any language and has no characters listed for auto-detection.
  • Rumi numerals script (Rumin) is not used by any language.
  • flag semaphore (Semap) is not used by any language and has no characters listed for auto-detection.
  • Visible Speech script (Visp) is not used by any language and has no characters listed for auto-detection.
  • mathematical notation script (Zmth) is not used by any language.
  • symbol script (Zsym) is not used by any language.
  • undetermined script (Zyyy) is not used by any language and has no characters listed for auto-detection.
  • uncoded script (Zzzz) is not used by any language and has no characters listed for auto-detection.
  • The codes fa-Arab, ug-Arab, ks-Arab, ps-Arab, ur-Arab, tt-Arab, ota-Arab, ku-Arab, mzn-Arab and sd-Arab are currently alias codes. Only one code should be used in the data.
  • The codes ms-Arab and kk-Arab are currently alias codes. Only one code should be used in the data.
  • The data key sort_by_scraping for Japanese script (Jpan) is invalid.

Checks performed

For multiple data modules:

  • Codes for languages, families and etymology-only languages must be unique and cannot clash with one another.
  • Canonical names for languages, families, and etymology-only languages must not be found in the list of other names.
  • Each name in the list of other names must appear only once.
  • otherNames, if present, must be an array.
  • Wikidata item IDs must be a positive integer or a string starting with Q and ending with decimal digits.

The following must be true of the data used by Module:languages:

  • Each code must be defined in the correct submodule according to whether it is two-letter, three-letter or exceptional.
  • The canonical name (field 1) must be present and must not be the same as the canonical name of another language.
  • If field 2 is not nil, it must a valid Wikidata item ID.
  • If field 3 or family is given and not nil, it must be a valid family code.
  • If field 4 or scripts is given and not nil, it must be an array, and each string in the array must be a valid script code.
  • If ancestors is given, it must be an array, and each string in the array must be a valid language or etymology language code.
  • If family is given, it must be a valid family code.
  • If type is given, it must be one of the recognised values (regular, reconstructed, appendix-constructed).
  • If entry_name is given, it must be a table that contains either two arrays (from and to) or a string (remove_diacritics) or both.
  • If sort_key is given, it may either be a string, or at table that in turn contains either two arrays (from and to) or a string (remove_diacritics).
  • If entry_name or sort_key is given, the from array must be longer or equal in length to the to array.
  • If standardChars is given, it must form a valid Lua string pattern when placed between square brackets with ^ before it ("[^...]). (It should match all characters regularly used in the language, but that cannot be tested.)
  • If override_translit is set, translit must also be set, because there must be a transliteration module that can override manual transliteration.
  • If link_tr is present, it must be true.
  • Have no data keys besides these: 1, 2, 3, "entry_name", "sort_key", "display", "otherNames", "aliases", "varieties", "type", "scripts", "ancestors", "wikimedia_codes", "wikipedia_article", "standardChars", "translit", "override_translit", "link_tr".

Checks not performed:

  • If translit is present, it should be the name of a module, and this module should contain a tr function that takes a pagename (and optionally a language code and script code) as arguments.
  • If sort_key is a string, it should be the name of a module, and this module should contain a makeSortKey function that takes a pagename (and optionally a language code and script code) as arguments.
  • If entry_name or sort_key is a table and contains a field remove_diacritics, the value of the field should be a string that forms a valid Lua pattern when it is placed inside negated set notation ([^...]).

These are not checked here, because module errors will quickly crop up in entries if these conditions are not met, assuming that Module:utilities attempts to generate a sortkey for a category pertaining to the language in question, or full_link attempts to use the transliteration module.

Module:languages/code to canonical name and Module:languages/canonical names must contain all the codes and canonical names found in the data submodules of Module:languages, and no more.

The following must be true of the data used by Module:etymology languages:

  • canonicalName must be given.
  • parent must be given must be a valid language, family or etymology-only language code.
  • If ancestors is given, it must be an array, and each string in the array must be a valid language or etymology language code. The etymology language should also be listed as the ancestor of a regular language.
  • Have no data keys besides these: "canonicalName", "otherNames", "parent", "ancestors", "wikipedia_article", "wikidata_item".

Codes in Module:families data must:

  • Have canonicalName, which must not be the same as the canonical name of another family.
  • If family is given, it must be a valid family code.
  • Have at least one language or subfamily belonging to it.
  • Have no data keys besides these: "canonicalName", "otherNames", "family", "protoLanguage", "wikidata_item".

Codes in Module:scripts data must:

  • Have canonicalName.
  • Have at least one language that lists it as one of its scripts.
  • Have a characters pattern for script autodetection, and this must form a valid Lua string pattern when placed between square brackets ("[...]"). (It should match all characters in the script, but that cannot be tested.)
  • Have no data keys besides these: "canonicalName", "otherNames", "parent", "systems", "wikipedia_article", "characters", "direction".

-- TODO:
	-- ietf_subtag field used with a 2/3-letter langauge/family code except qaa-qtz, or a 4-letter script code.
	-- Check against files containing up-to-date ISO data, to cross-check validity.

local m_languages = require("Module:languages")
local m_language_data = require("Module:languages/data/all")
local m_language_codes = require("Module:languages/code to canonical name")
local m_language_canonical_names = require("Module:languages/canonical names")
local m_etym_language_data = require("Module:etymology languages/data")
local m_etym_language_codes = require("Module:etymology languages/code to canonical name")
local m_etym_language_canonical_names = require("Module:etymology languages/canonical names")
local m_family_data = require("Module:families/data")
local m_family_codes = require("Module:families/code to canonical name")
local m_family_canonical_names = require("Module:families/canonical names")
local m_scripts = require("Module:scripts")
local m_script_data = require("Module:scripts/data")
local m_links = require("Module:links")

local m_script_utils = require("Module:script utilities")
local m_str_utils = require("Module:string utilities")
local m_table = require("Module:table")
local Array = require("Module:array")

local codepoint = m_str_utils.codepoint
local concat = table.concat
local dump = mw.dumpObject
local gcodepoint = m_str_utils.gcodepoint
local get_lang = m_languages.getByCode
local insert = table.insert
local list_to_text = mw.text.listToText
local new_title = mw.title.new
local split = m_str_utils.split
local ugmatch = m_str_utils.gmatch
local umatch = m_str_utils.match

local export = {}
local messages

local function discrepancy(modname, ...)
	local ok, result = pcall(function(...) messages[modname]:insert(string.format(...)) end, ...)
	if not ok then
		mw.log(result, ...)
	end
end

local all_codes = {}

local language_names = {}
local etym_language_names = {}
local family_names = {}
local script_names = {}

local nonempty_families = {}
local allowed_empty_families = {tbq = true}
local nonempty_scripts = {}
	
do
	local function link_lang(name)
		if name:find("[Ll]anguage$") then
			return "[[:Category:" .. name .. "|" .. name .. "]]"
		else
			return "[[:Category:" .. name .. " language|" .. name .. " language]]"
		end
	end
	
	local function link_etym_lang(name)
		if name:find("[Ll]anguage$") then
			return name
		else
			return name .. " language"
		end
	end
	
	local function link_family(name)
		if name:match("[Ll]anguages$") or name:match("[Ll]ects$") then
			return "[[:Category:" .. name .. "|" .. name .. " family]]"
		else
			return "[[:Category:" .. name .. " languages|" .. name .. " family]]"
		end
	end
	
	function export.link(data)
		if not data[1] then
			return "???"
		end
		local type = data.type
		return type:match("etymology%-only") and link_etym_lang(data[1]) or
			type:match("family") and link_family(data[1]) or
			link_lang(data[1])
	end
end
local link = export.link
	
local function link_script(name)
	if not name then
		return "???"
	elseif name:find("[Cc]ode$") or name:find("[Ss]emaphore$") then
		return "[[:Category:" .. name:gsub("^%l", string.upper) .. "|" .. name .. "]]"
	else
		return "[[:Category:" .. name .. " script|" .. name .. " script]]"
	end
end

local function invalid_keys_message(modname, code, data, invalid_keys, is_script)
	local plural = #invalid_keys ~= 1
	discrepancy(modname, "The data key%s %s for %s (<code>%s</code>) %s invalid.",
		plural and "s" or "",
		invalid_keys
			:map(
				function(key)
					return "<code>" .. key .. "</code>"
				end)
			:concat(", "),
		(is_script and link_script or link)(data[1]),
		code,
		plural and "are" or "is")
end

local function check_data_keys(valid_keys, is_script)
	valid_keys = Array(valid_keys):to_set()
	
	return function (modname, code, data)
		local invalid_keys
		for k in pairs(data) do
			if not valid_keys[k] then
				invalid_keys = invalid_keys or Array()
				invalid_keys:insert(k)
			end
		end
		if invalid_keys then
			invalid_keys_message(modname, code, data, invalid_keys, is_script)
		end
	end
end

-- Modification of isArray in [[Module:table]].
-- This assumes all keys are either integers or non-numbers.
-- If there are fractional numbers, the results might be incorrect.
-- For instance, find_gap{"a", "b", [0.5] = true} evaluates to 3, but there
-- isn't a gap at 3 in the sense of there being an integer key greater than 3.
local function find_gap(t, can_contain_non_number_keys)
	local i = 0
	for k in pairs(t) do
		if not (can_contain_non_number_keys and type(k) ~= "number") then
			i = i + 1
			if t[i] == nil then
				return i
			end
		end
	end
end

local function check_true_or_string_or_nil(modname, code, data, field_name)
	local field = data[field_name]
	if not (field == nil or field == true or type(field) == "string") then
		discrepancy(modname,
			"%s (<code>%s</code>) has an <code>%s</code> value that is not <code>nil</code>, <code>true</code> or a string: <code>%s</code>",
			link(data), code, field_name,
			dump(data[field_name])
		)
	end
end

local function check_array(modname, code, canonical_name, data, array_name, subarray_name, can_contain_non_number_keys)
	local subtable = data
	if subarray_name then
		subtable = assert(data[subarray_name], subarray_name)
	end
	local array_type = type(subtable[array_name])
	if array_type == "table" then
		local gap = find_gap(subtable[array_name], can_contain_non_number_keys)
		if gap then
			discrepancy(modname, "The %s array in %sthe data table for %s (<code>%s</code>) has a gap at index %d.",
				array_name,
				subarray_name and "the " .. subarray_name .. " field in " or "",
				canonical_name,
				code, gap)
		else
			return true
		end
	else
		discrepancy(modname, "The %s field in %sthe data table for %s (<code>%s</code>) should be an array (table) but is %s.",
			array_name,
			subarray_name and "the " .. subarray_name .. " field in " or "",
			canonical_name,
			code,
			array_type == "nil" and "nil" or "a " .. array_type)
	end
end

local function check_no_alias_codes(modname, mod_data)
	local lookup, discrepancies = {}, {}
	for k, v in pairs(mod_data) do
		local check = lookup[v]
		if check then
			discrepancies[check] = discrepancies[check] or {"<code>" .. check .. "</code>"}
			insert(discrepancies[check], "<code>" .. k .. "</code>")
		else
			lookup[v] = k
		end
	end
	for _, v in pairs(discrepancies) do
		discrepancy(modname, "The codes " .. list_to_text(v, ", ", " and ") .. " are currently alias codes. Only one code should be used in the data.")
	end
end

local function check_wikidata_item(modname, code, data, key)
	local data_item = data[key]
	if data_item == nil then
		return
	elseif type(data_item) == "number" then
		if not require "Module:table".isPositiveInteger(data_item) then
			discrepancy(modname, "%g, the Wikidata item id for %s (<code>%s</code>), is not a positive integer or a string in the correct format.",
				data_item, data[1], code)
		end
	elseif type(data_item) == "string" then
		if not data_item:find "^Q%d+$" then
			discrepancy(modname, "%s, the Wikidata item id for %s (<code>%s</code>), is not a string in the correct format or a positive integer.",
				data_item, data[1], code)
		end
	end
end

local function check_other_names_or_aliases(modname, code, canonical_name, data, data_key, allow_nested)
	local array = data[data_key]
	if not array then
		return
	end
	check_array(modname, code, canonical_name, data, data_key, nil, true)

	local names = {}
	local function check_other_name(other_name)
		if other_name == canonical_name then
			discrepancy(modname,
				"%s, the canonical name for <code>%s</code>, is repeated in the table of <code>%s</code>.",
				canonical_name, code, data_key)
		end
		if names[other_name] then
			discrepancy(modname,
				"The name %s is found twice or more in the list of <code>%s</code> for %s (<code>%s</code>).",
				other_name, data_key, canonical_name, code)
		end
		names[other_name] = true
	end

	for _, other_name in ipairs(array) do
		if type(other_name) == "table" then
			if not allow_nested then
				discrepancy(modname,
					"A nested table is found in the list of <code>%s</code> for %s (<code>%s</code>), but isn't allowed.",
					data_key, canonical_name, code)
			else
				for _, on in ipairs(other_name) do
					check_other_name(on)
				end
			end
		else
			check_other_name(other_name)
		end
	end
end

local function check_other_names_aliases_varieties(modname, code, canonical_name, data)
	if data.otherNames then
		check_other_names_or_aliases(modname, code, canonical_name, data, "otherNames")
	end
	if data.aliases then
		check_other_names_or_aliases(modname, code, canonical_name, data, "aliases")
	end
	if data.varieties then
		check_other_names_or_aliases(modname, code, canonical_name, data, "varieties", true)
	end
end

local function validate_pattern(pattern, modname, code, data, standardChars)
	if type(pattern) ~= "string" then
		discrepancy(modname, "\"%s\", the %spattern for %s (<code>%s</code>), is not a string.",
			pattern, standardChars and "standard character " or "", code, data[1])
	end
	local ranges
	for lower, higher in ugmatch(pattern, "(.)%-%%?(.)") do
		if codepoint(lower) >= codepoint(higher) then
			ranges = ranges or Array()
			insert(ranges, { lower, higher })
		end
	end
	if ranges and ranges[1] then
		local plural = #ranges ~= 1 and "s" or ""
		discrepancy(modname, "%s (<code>%s</code>) specifies an invalid pattern " ..
			"for %scharacter detection: <code>\"%s\"</code>. The first codepoint%s " ..
			"in the range%s %s %s must be less than the second.",
			link(data), code, standardChars and "standard " or "", pattern, plural, plural,
			ranges
				:map(
					function(range)
						return range[1] .. "-" .. range[2] .. (" (U+%X, U+%X)")
							:format(codepoint(range[1]), codepoint(range[2]))
					end)
				:concat(", "),
			#ranges ~= 1 and "are" or "is")
	end
	if not pcall(umatch, "", "[" .. pattern .. "]") then
		discrepancy(modname, "%s (<code>%s</code>) specifies an invalid pattern for " ..
			(standardChars and "standard" or "") .. " character detection: <code>\"%s\"</code>",
			link(data), code, pattern)
	end
end

local remove_exceptions_addition = 0xF0000
local maximum_code_point = 0x10FFFF
local remove_exceptions_maximum_code_point = maximum_code_point - remove_exceptions_addition

local function check_entry_name_or_sortkey(modname, code, data, replacements_name)
	local canonical_name = data[1]
	
	local replacements = data[replacements_name]
	if type(replacements) == "string" then
		if not (replacements_name == "sort_key" or replacements_name == "entry_name") then
			discrepancy(modname, "The %s field in the data table for %s (<code>%s</code>) must be a table.",
				replacements_name, canonical_name, code)
		end
		return
	end
	
	if (replacements.from ~= nil) ~= (replacements.to ~= nil) then
		discrepancy(modname,
			"The <code>from</code> and <code>to</code> arrays in the <code>%s</code> table for %s (<code>%s</code>) are not both defined or both undefined.",
			replacements_name, canonical_name, code)
	elseif replacements.from then
		for _, key in ipairs { "from", "to" } do
			check_array(modname, code, canonical_name, data, key, replacements_name)
		end
	end
	
	if replacements.remove_diacritics and type(replacements.remove_diacritics) ~= "string" then
		discrepancy(modname,
			"The <code>remove_diacritics</code> field in the <code>%s</code> table for %s (<code>%s</code>) table must be a string.",
			replacements_name, canonical_name, code)
	end
	
	if replacements.remove_exceptions then
		if check_array(modname, code, canonical_name, data, "remove_exceptions", replacements_name) then
			for sequence_i, sequence in ipairs(replacements.remove_exceptions) do
				local code_point_i = 0
				for code_point in gcodepoint(sequence) do
					code_point_i = code_point_i + 1
					if code_point > remove_exceptions_maximum_code_point then
						discrepancy(modname,
							"Code point #%d (0x%04X) in field #%d of the <code>remove_exceptions</code> array for %s (<code>%s</code>) is over U+%04X.",
							code_point_i, code_point, sequence_i, canonical_name, code, remove_exceptions_maximum_code_point)
					end
					
				end
			end
		end
	end
	
	if replacements.from and replacements.to
			and m_table.length(replacements.to) > m_table.length(replacements.from) then
		discrepancy(modname,
			"The <code>from</code> array in the <code>%s</code> table for %s (<code>%s</code>) must be shorter or the same length as the <code>to</code> array.",
			replacements_name, canonical_name, code)
	end
end

do
	local function has_ancestor(lang, code)
		for _, anc in ipairs(lang:getAncestors()) do
			if code == anc:getCode() or has_ancestor(anc, code) then
				return true
			end
		end
	end
	
	local function get_default_ancestors(lang)
		if lang:hasType("etymology-only") then
			local parent = lang:getParent()
			if not has_ancestor(parent, lang:getCode()) then
				return parent:getAncestorCodes()
			end
		end
		local fam_code, def_anc = lang:getFamilyCode()
		while fam_code and fam_code ~= "qfa-not" do
			local fam = m_family_data[fam_code]
			def_anc = fam.protoLanguage or
				m_language_data[fam_code .. "-pro"] and fam_code .. "-pro" or
				m_etym_language_data[fam_code .. "-pro"] and fam_code .. "-pro"
			if def_anc and def_anc ~= lang:getCode() then
				return {def_anc}
			end
			fam_code = fam[3]
		end
	end
	
	local function iterate_ancestor(code, data, modname, anc_code, lang)
		local anc = get_lang(anc_code, nil, true)
		if not anc then
			discrepancy(modname,
				"%s (<code>%s</code>) lists the invalid language code <code>%s</code> as its ancestor.",
				link(data), code, anc_code)
			return
		end
		local anc_fam = anc:getFamily()
		if not anc_fam then
			discrepancy(modname,
				"%s has no family.",
				anc_code)
			return
		end
		local anc_fam_code = anc_fam:getCode()
		local def_ancs = get_default_ancestors(lang)
		if def_ancs then
			for _, def_anc in ipairs(def_ancs) do
				def_anc = get_lang(def_anc, nil, true)
				if def_anc and (
					anc_code == def_anc:getCode() or
					has_ancestor(def_anc, anc_code) or
					def_anc:hasParent(anc_code) and not has_ancestor(anc, def_anc:getCode())
				) then
					discrepancy(modname,
						"%s (<code>%s</code>) has the %s (<code>%s</code>) listed in its ancestor field, which is redundant, since it is determined to be ancestral automatically.",
						link(data), code,
						link(anc:getRawData()), anc_code)
				end
			end
		end
		if not lang:inFamily(anc_fam_code) then
			discrepancy(modname,
				"%s (<code>%s</code>) has %s (<code>%s</code>) set as an ancestor, but is not in the %s (<code>%s</code>).",
				link(data), code,
				link(anc:getRawData()), anc_code,
				link(anc_fam:getRawData()), anc_fam_code)
		end
		local fam, proto = lang
		repeat
			fam = fam:getFamily()
			proto = fam and fam:getProtoLanguage()
		until proto or not fam or fam:getCode() == "qfa-not"
		if proto and not (
			proto:getCode() == anc:getCode() or
			proto:hasAncestor(anc:getCode()) or
			anc:hasAncestor(proto:getCode())
		) then
			local fam = lang:getFamily()
			discrepancy(modname,
				"%s (<code>%s</code>) is in the %s (<code>%s</code>) and has %s (<code>%s</code>) set as an ancestor, but it is not possible to form an ancestral chain between them.",
				link(data), code,
				link(fam:getRawData()), fam:getCode(),
				link(anc:getRawData()), anc_code)
		end
	end
	
	function export.check_ancestors(code, data, modname)
		local ancestors = data.ancestors
		if not ancestors then
			return
		elseif type(ancestors) == "string" then
			ancestors = split(ancestors, "%s*,%s*", true)
		end
		local lang = get_lang(code, nil, true)
		for _, anc in ipairs(ancestors) do
			iterate_ancestor(code, data, modname, anc, lang)
		end
	end
end
	
local function check_code_to_name_and_name_to_code_maps(
		source_module_type,
		source_module_description,
		code_to_module_map, name_to_code_map,
		code_to_name_modname, code_to_name_module,
		name_to_code_modname, name_to_code_module)
	local aliases = require("Module:languages/data").aliases
	local function check_code_and_name(modname, code, canonical_name)
		-- Check the code is in code_to_module_map and that it didn't originate from the wrong data module.
		local check_mod = code_to_module_map[code] or code_to_module_map[aliases[code]]
		if not (check_mod and check_mod:match("^" .. source_module_type .. "/data")) then
			if not name_to_code_map[canonical_name] then
				discrepancy(modname,
					"The code <code>%s</code> and the canonical name %s should be removed; they are not found in %s.",
					code, canonical_name, source_module_description)
			else
				discrepancy(modname,
					"<code>%s</code>, the code for the canonical name %s, is wrong; it should be <code>%s</code>.",
					code, canonical_name, name_to_code_map[canonical_name])
			end
		elseif not name_to_code_map[canonical_name] then
			local data_table = require("Module:" .. code_to_module_map[code])[code]
			discrepancy(modname,
				"%s, the canonical name for the code <code>%s</code>, is wrong; it should be %s.",
				canonical_name, code, data_table[1])
		end
	end

	for code, canonical_name in pairs(code_to_name_module) do
		check_code_and_name(code_to_name_modname, code, canonical_name)
	end
	
	for canonical_name, code in pairs(name_to_code_module) do
		check_code_and_name(name_to_code_modname, code, canonical_name)
	end
end

local function check_extraneous_extra_data(
		data_modname, data_module, extra_data_modname, extra_data_module)
	for code, _ in pairs(extra_data_module) do
		if not data_module[code] then
			discrepancy(extra_data_modname,
				"Language code <code>%s</code> is not found in [[Module:%s]], and should be removed from [[Module:%s]].",
				code, data_modname, extra_data_modname
			)
		end
	end
end

-- Just trying to not have a module error when someone puts a script code
-- in the position of a language code.
local function show_family_code(code)
	if type(code) == "string" then
		return "<code>" .. code .. "</code>"
	else
		return require("Module:debug").highlight_dump(code)
	end
end

local function check_languages()
	local check_language_data_keys = check_data_keys{
		1, 2, 3, 4, -- canonical name, wikidata item, family, scripts
		"display_text", "generate_forms", "entry_name", "sort_key",
		"otherNames", "aliases", "varieties", "ietf_subtag",
		"type", "ancestors",
		"wikimedia_codes", "wikipedia_article", "standardChars",
		"translit", "override_translit", "link_tr",
		"dotted_dotless_i"
	}
	
	local function check_language(modname, code, data, mainData, extraData)
		local canonical_name, lang_type = data[1], data.type
		
		check_language_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
		else
			if not m_language_codes[code] then
				discrepancy("languages/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name)
			end
			all_codes[code] = modname
		end
		
		if code:sub(-4) == "-pro" then
			local fam_code = code:sub(1, -5)
			local fam = get_lang(fam_code, nil, true, true)
			if not fam then
				discrepancy(modname,
					"%s (<code>%s</code>) has a proto-language code associated with the invalid code <code>%s</code>.",
					link(data), code, fam_code)
			elseif not fam:hasType("family") then
				discrepancy(modname,
					"%s (<code>%s</code>) has a proto-language code associated with %s (<code>%s</code>), which is not a family.",
					link(data), code, fam:getCanonicalName(), fam_code)
			else
				local expected_name = "Proto-" .. fam:getCanonicalName()
				if canonical_name ~= expected_name then
					discrepancy(modname,
						"%s (<code>%s</code>) does not have the expected name \"%s\", even though it is the proto-language of the %s (<code>%s</code>).",
						link(data), code, expected_name, fam:getCategoryName(), fam_code)
				end
			end
		end
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link(data), code, language_names[canonical_name])
		else
			if not m_language_canonical_names[canonical_name] then
				discrepancy("languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			language_names[canonical_name] = code
		end
		
		check_wikidata_item(modname, code, data, 2)

		if extraData then
			check_other_names_aliases_varieties(modname, code, canonical_name, extraData)
		end
		
		if lang_type and not (lang_type == "regular" or lang_type == "reconstructed" or lang_type == "appendix-constructed") then
			discrepancy(modname, "%s (<code>%s</code>) is of an invalid type <code>%s</code>.", link(data), code, data.type)
		end
		
		if mainData.aliases then
			discrepancy(modname, "%s (<code>%s</code>) has the <code>aliases</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(data), code)
		end
		
		if mainData.varieties then
			discrepancy(modname, "%s (<code>%s</code>) has the <code>varieties</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(data), code)
		end
		
		if mainData.otherNames then
			discrepancy(modname, "%s (<code>%s</code>) has the <code>otherNames</code> key. This must be moved to [[Module:" .. modname .. "/extra]].", link(data), code)
		end
		
		if not extraData then
			discrepancy(modname .. "/extra", "%s (<code>%s</code>) has data in [[Module:" .. modname .. "]], but does not have corresponding data in [[Module:" .. modname .. "/extra]].", link(data), code)
		--elseif extraData.otherNames then
		--	discrepancy(modname .. "/extra", "%s (<code>%s</code>) has <code>otherNames</code> key, but these should be changed to either <code>aliases</code> or <code>varieties</code>.", link(data), code)
		end
		
		local sc = data[4]
		if sc then
			if type(sc) == "string" then
				sc = split(sc, "%s*,%s*", true)
			end
			if type(sc) == "table" then
				if not sc[1] then
					discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(data), code)
				else
					for _, sccode in ipairs(sc) do
						local cur_sc = m_script_data[sccode]
						if not (cur_sc or sccode == "All" or sccode == "Hants") then
							discrepancy(modname,
								"%s (<code>%s</code>) lists the invalid script code <code>%s</code>.",
								link(data), code, sccode)
						-- elseif not cur_sc.characters then
						-- 	discrepancy(modname,
						-- 		"%s (<code>%s</code>) lists a script without characters <code>%s</code> (%s).",
						-- 		link(data), code, sccode, cur_sc[1])
						end
			
						nonempty_scripts[sccode] = true
					end
				end
			else
				discrepancy(modname,
					"The %s field for %s (<code>%s</code>) must be a table or string.",
					4, link(data), code)
			end
		end
		
		if data.ancestors then
			export.check_ancestors(code, data, modname)
		end
		
		if data[3] then
			local family = data[3]
			if not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has the invalid family code %s.",
					link(data), code, show_family_code(family))
			end
			
			nonempty_families[family] = true
		end
		
		if data.sort_key then
			check_entry_name_or_sortkey(modname, code, data, "sort_key")
		end
		
		if data.entry_name then
			check_entry_name_or_sortkey(modname, code, data, "entry_name")
		end

		if data.display then
			check_entry_name_or_sortkey(modname, code, data, "display")
		end

		if data.standardChars then
			if type(data.standardChars) == "table" then
				local sccodes = {}
				for _, sccode in ipairs(sc) do
					sccodes[sccode] = true
				end
				for sccode in pairs(data.standardChars) do
					if not (sccodes[sccode] or sccode == 1) then
						discrepancy(modname, "The field %s in the standardChars table for %s (<code>%s</code>) does not match any script for that language.",
							sccode, link(data), code)
					end
				end
			elseif data.standardChars and type(data.standardChars) ~= "string" then
				discrepancy(modname, "The standardChars field in the data table for %s (<code>%s</code>) must be a string or table.",
					link(data), code)
			end
		end
		
		check_true_or_string_or_nil(modname, code, data, "override_translit")
		check_true_or_string_or_nil(modname, code, data, "link_tr")
		
		if data.override_translit and not data.translit then
			discrepancy(modname,
				"%s (<code>%s</code>) has <code>override_translit</code> set, but no transliteration module",
				link(data), code)
		end
	end
	
	local function check_module(modname, test)
		local mod_data = mw.loadData("Module:" .. modname)
		local extra_modname = modname .. "/extra"
		local extra_mod_data = mw.loadData("Module:" .. extra_modname)
		for code, data in pairs(mod_data) do
			test(modname, code, data)
			check_language(modname, code, data, mod_data[code], extra_mod_data[code])
		end
		check_no_alias_codes(modname, mod_data)
		check_no_alias_codes(extra_modname, extra_mod_data)
		check_extraneous_extra_data(modname, mod_data, extra_modname, extra_mod_data)
	end
	
	-- Check two-letter codes
	check_module(
		"languages/data/2",
		function(modname, code, data)
			if not code:find("^[a-z][a-z]$") then
				discrepancy(modname, "%s (<code>%s</code>) does not have a two-letter code.", link(data), code)
			end
		end
	)
	
	-- Check three-letter codes
	for i = 0x61, 0x7A do -- a to z
		local letter = string.char(i)
		check_module(
			"languages/data/3/" .. letter,
			function(modname, code, data)
				if not code:find("^" .. letter .. "[a-z][a-z]$") then
					discrepancy(modname,
						"%s (<code>%s</code>) does not have a three-letter code starting with \"<code>%s</code>\".",
						link(data), code, letter)
				end
			end
		)
	end
	
	-- Check exceptional codes
	check_module(
		"languages/data/exceptional",
		function(modname, code, data)
			if code:find("^[a-z][a-z][a-z]?$") then
				discrepancy(modname, "%s (<code>%s</code>) has a two- or three-letter code.", link(data), code)
			end
		end
	)
	
	-- These checks must be done while all_codes only contains language codes:
	-- that is, after language data modules have been processed, but before
	-- etymology languages, families, and scripts have.
	check_code_to_name_and_name_to_code_maps(
		"languages",
		"a submodule of [[Module:languages]]",
		all_codes, language_names,
		"languages/code to canonical name", m_language_codes,
		"languages/canonical names", m_language_canonical_names
	)
	
	-- Check [[Template:langname-lite]]
	local frame = mw.getCurrentFrame()
	local content = new_title("Template:langname-lite"):getContent()
	content = content:gsub("%<%!%-%-.-%-%-%>", "") -- remove comments
	local match = ugmatch(content, "\n\t*|#*([^\n]+)=([^\n]*)")
	while true do
		local code, name = match()
		if not code then return "OK" end
		if code:len() > 1 and code ~= "default" then
			for _, code in pairs(split(code, "|", true)) do
				local lang = get_lang(code, nil, true, true)
				if name:match("etymcode") then
					local nonEtym_name = frame:preprocess(name)
					local nonEtym_real_name = lang:getFullName()
					if nonEtym_name ~= nonEtym_real_name then
						discrepancy("Template:langname-lite", "Code: <code>" .. code .. "</code>. Saw name: " .. nonEtym_name .. ". Expected name: " .. nonEtym_real_name .. ".")
					end
					name = frame:preprocess(name:gsub("{{{allow etym|}}}", "1"))
				elseif name:match("familycode") then
					name = name:match("familycode|(.-)|")
				else
					name = name
				end
				if not lang then
					discrepancy("Template:langname-lite", "Code: <code>" .. code .. "</code>. Saw name: " .. name .. ". Language not present in data.")
				else
					local real_name = lang:getCanonicalName()
					if name ~= real_name then
						discrepancy("Template:langname-lite", "Code: <code>" .. code .. "</code>. Saw name: " .. name .. ". Expected name: " .. real_name .. ".")
					end
				end
			end
		end
	end
end

local function check_etym_languages()
	local modname = "etymology languages/data"
	
	local check_etymology_language_data_keys = check_data_keys{
		1, 2, 3, 4, 5, -- canonical name, wikidata item, family, scripts, parent
		"display_text", "generate_forms", "entry_name", "sort_key",
		"otherNames", "aliases", "varieties", "ietf_subtag",
		"type", "main_code", "ancestors",
		"wikimedia_codes", "wikipedia_article", "standardChars",
		"translit", "override_translit", "link_tr",
		"dotted_dotless_i"
	}
	
	for code, data in pairs(m_etym_language_data) do
		local canonical_name, parent =
			data[1], data[5]
		check_etymology_language_data_keys(modname, code, data)
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
		else
			if not m_etym_language_codes[code] then
				discrepancy("etymology languages/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name)
			end
			all_codes[code] = modname
		end
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[canonical_name] then
			local m_canonical_lang = m_languages.getByCanonicalName(canonical_name, nil, true)
			if not m_canonical_lang then
				discrepancy(modname, "%s (<code>%s</code>) has a canonical name that cannot be looked up.",
					link(data), code)
			elseif data.main_code ~= m_canonical_lang:getCode() then
				discrepancy(modname,
					"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
					link(data), code, language_names[canonical_name])
			end
		else
			if not m_etym_language_canonical_names[canonical_name] then
				discrepancy("etymology languages/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			etym_language_names[canonical_name] = code
		end
		
		check_other_names_aliases_varieties(modname, code, canonical_name, data)
		
		if parent then
			if type(parent) ~= "string" then
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has a parent language or family code that is %s rather than a string.",
					link(data), code, parent == nil and "nil" or "a " .. type(parent))
			elseif not (m_language_data[parent] or m_family_data[parent] or m_etym_language_data[parent]) then
				discrepancy(modname,
					"Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.",
					link(data), code, parent)
			end
			nonempty_families[parent] = true
		else
			discrepancy(modname,
				"Etymology-only %s (<code>%s</code>) has no parent language or family code.",
				link(data), code)
		end
		
		if data.ancestors then
			export.check_ancestors(code, data, modname)
		end
		
		if data[3] then
			local family = data[3]
			if not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has the invalid family code %s.",
					link(data), code, show_family_code(family))
			end
			nonempty_families[family] = true
		end
		
		check_wikidata_item(modname, code, data, 2)
	end

	local checked = {}
	for code, data in pairs(m_etym_language_data) do
		local stack = {}

		while data do
			if checked[data] then
				break	
			end
			if stack[data] then
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data), code,
					link(m_etym_language_data[data[5]]), data.parent or data[5]
				)
				break
			end
			stack[data] = true
			code, data = data[5], data[5] and m_etym_language_data[data[5]]
		end
		
		for data in pairs(stack) do
			checked[data] = true	
		end
	end
	
	check_no_alias_codes(modname, m_etym_language_data)
	
	check_code_to_name_and_name_to_code_maps(
		"etymology languages",
		"[[Module:etymology languages/data]]",
		all_codes, etym_language_names,
		"etymology languages/code to canonical name", m_etym_language_codes,
		"etymology languages/canonical names", m_etym_language_canonical_names)
end

local function check_families()
	local modname = "families/data"
	
	local check_family_data_keys = check_data_keys{
		1, 2, 3, -- canonical name, wikidata item, (parent) family
		"type", "ietf_subtag",
		"protoLanguage", "otherNames", "aliases", "varieties",
	}
	
	for code, data in pairs(m_family_data) do
		check_family_data_keys(modname, code, data)
		
		local canonical_name, family, protolang = data[1], data[3], data.protoLanguage
		
		if all_codes[code] then
			discrepancy(modname, "Code <code>%s</code> is not unique; it is also defined in [[Module:%s]].", code, all_codes[code])
		else
			if not m_family_codes[code] then
				discrepancy("families/code to canonical name", "The code <code>%s</code> (%s) is missing.", code, canonical_name)
			end
			all_codes[code] = modname
		end
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif family_names[canonical_name] then
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link(data), code, family_names[canonical_name])
		else
			if not m_family_canonical_names[canonical_name] then
				discrepancy("families/canonical names", "The canonical name %s (<code>%s</code>) is missing.", canonical_name, code)
			end
			family_names[canonical_name] = code
		end
		
		if data[2] and type(data[2]) ~= "number" then
			discrepancy(modname, "%s (<code>%s</code>) has a wikidata item value that is not a number or <code>nil</code>: %s", link(data), code, dump(data[2]))
		end
		
		check_other_names_aliases_varieties(modname, code, canonical_name, data)
		
		if family then
			if family == code and code ~= "qfa-not" then
				discrepancy(modname,
					"%s (<code>%s</code>) has itself as its family.",
					link(data), code)
			elseif not m_family_data[family] then
				discrepancy(modname,
					"%s (<code>%s</code>) has the invalid parent family code %s.",
					link(data), code, show_family_code(family))
			end
			
			nonempty_families[family] = true
		end
		
		if protolang then
			local protolang_obj = get_lang(protolang, nil, true)
			if not protolang_obj then
				discrepancy(modname,
					"%s (<code>%s</code>) has the invalid proto-language code <code>%s</code>.",
					canonical_name, code, protolang)
			elseif protolang == code .. "-pro" then
				discrepancy(modname,
					"%s (<code>%s</code>) has %s (<code>%s</code>) listed as its proto-language, which is redundant, since it is determined to be the proto-language automatically.",
					canonical_name, code,
					protolang_obj:getCanonicalName(), protolang)
			elseif protolang:sub(-4) == "-pro" then
				discrepancy(modname,
					"%s (<code>%s</code>) has %s (<code>%s</code>) listed as its proto-language, which is supposed to be the proto-language for the family <code>%s</code>.",
					canonical_name, code,
					protolang_obj:getCanonicalName(), protolang, protolang:sub(1, -5))
			end
		end
		
		check_wikidata_item(modname, code, data, 2)
	end
	
	for code, data in pairs(m_family_data) do
		if not (nonempty_families[code] or allowed_empty_families[code]) then
			discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data), code)
		end
	end

	local checked = { ["qfa-not"] = true }
	for code, data in pairs(m_family_data) do
		local stack = {}

		while data do
			if checked[code] then
				break	
			end
			if stack[code] then
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(data), code,
					link(m_family_data[data[3]]), data[3]
				)
				break
			end
			stack[code] = true
			code, data = data[3], m_family_data[data[3]]
		end
		
		for code in pairs(stack) do
			checked[code] = true	
		end
	end
	
	check_no_alias_codes(modname, m_family_data)
	
	check_code_to_name_and_name_to_code_maps(
		"families",
		"[[Module:families/data]]",
		all_codes, family_names,
		"families/code to canonical name", m_family_codes,
		"families/canonical names", m_family_canonical_names)
end

local function check_scripts()
	local modname = "scripts/data"
	
	local check_script_data_keys = check_data_keys({
		1, 2, -- canonical name, writing systems
		"canonicalName", "otherNames", "aliases", "varieties", "parent", "ietf_subtag",
		"wikipedia_article", "ranges", "characters", "spaces", "capitalized", "translit", "direction",
		"character_category", "normalizationFixes"
	}, true)
	
	local m_script_codes = require("Module:scripts/code to canonical name")
	local m_script_canonical_names = require("Module:scripts/by name")
	
	-- Just to satisfy requirements of check_code_to_name_and_name_to_code_maps.
	local script_code_to_module_map = {}
	
	for code, data in pairs(m_script_data) do
		local canonical_name = data[1]
		if not m_script_codes[code] and #code == 4 then
			discrepancy("scripts/code to canonical name", "<code>%s</code> (%s) is missing", code, canonical_name)
		end
		
		check_script_data_keys(modname, code, data)
		
		if not canonical_name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif script_names[canonical_name] then
			--[=[
			discrepancy(modname,
				"%s (<code>%s</code>) has a canonical name that is not unique; it is also used by the code <code>%s</code>.",
				link_script(data.names[1]), code, script_names[data.names[1]])
			--]=]
		else
			if not m_script_canonical_names[canonical_name] and #code == 4 then
				discrepancy("scripts/by name", "%s (<code>%s</code>) is missing", canonical_name, code)
			end
			script_names[canonical_name] = code
		end
		
		check_other_names_aliases_varieties(modname, code, canonical_name, data)
		
		if not nonempty_scripts[code] then
			discrepancy(modname,
				"%s (<code>%s</code>) is not used by any language%s.",
				link_script(canonical_name), code, data.characters and ""
					or " and has no characters listed for auto-detection")
		--[[
		elseif not data.characters then
			discrepancy(modname, "%s (<code>%s</code>) has no characters listed for auto-detection.", link_script(canonical_name), code)
		--]]
		end

		if data.characters then
			validate_pattern(data.characters, modname, code, data, false)
		end
		
		script_code_to_module_map[code] = modname
	end
	
	check_no_alias_codes(modname, m_script_data)
	
	check_code_to_name_and_name_to_code_maps(
		"scripts",
		"a submodule of [[Module:scripts]]",
		script_code_to_module_map, script_names,
		"scripts/code to canonical name", m_script_codes,
		"scripts/by name", m_script_canonical_names)
end

-- FIXME: this is quite messy.
local function check_wikidata_languages()
	local data = mw.text.jsonDecode(new_title("Module:languages/data/wikidata.json"):getContent())
	
	local seen = {{}, {}, {}, [5] = {}}
	for _, item in ipairs(data) do
		local id = item.id
		for k, v in pairs(item) do
			if k ~= "id" then
				local _seen = seen[k]
				for i, code in ipairs(v) do
					local _code = code[1]
					local _type = type(_seen[_code])
					if _type == "table" then
						insert(_seen[_code], id)
					elseif _type == "string" then
						_seen[_code] = {_seen[_code], id}
					else
						_seen[_code] = id
					end
				end
			end
		end
	end
	
	for k, v in pairs(seen) do
		for code, ids in pairs(v) do
			if type(ids) == "table" then
				local t = {}
				for i, id in ipairs(ids) do
					t[i] = ("<code>[[d:%s|%s]]</code>"):format(id, id)
				end
				discrepancy("languages/data/wikidata.json", "<code>%s</code> is set as an ISO 639-%d code on multiple items: %s.",
					code, k, list_to_text(t))
				
			end
		end
	end
end

local function check_labels()
	local check_label_data_keys = check_data_keys{
		"display", "Wikipedia", "glossary",
		"plain_categories", "topical_categories", "pos_categories", "regional_categories", "sense_categories",
		"omit_preComma", "omit_postComma", "omit_preSpace",
		"deprecated", "track"
	}
	
	local function check_label(modname, code, data)
		local _type = type(data)
		if _type == "table" then
			check_label_data_keys(modname, code, data)
		elseif _type ~= "string" then
			discrepancy(modname,
				"The data for label <code>%s</code> is a %s; only tables and strings are allowed.",
				code, _type)
		end
	end
	
	for _, module in ipairs{"", "/regional", "/topical"} do
		local modname = "Module:labels/data" .. module
		module = require(modname)
		for label, data in pairs(module) do
			check_label(modname, label, data)
		end
	end
	
	for code in pairs(m_language_codes) do
		local modname = "Module:labels/data/lang/" .. code
		local ok, module = pcall(require, modname)
		if ok then
			for label, data in pairs(module) do
				check_label(modname, label, data)
			end
		end
	end
end

local function check_zh_trad_simp()
	local m_ts = require("Module:zh/data/ts")
	local m_st = require("Module:zh/data/st")
	local ruby = require("Module:ja-ruby").ruby_auto
	local lang = get_lang("zh")
	local Hant = m_scripts.getByCode("Hant")
	local Hans = m_scripts.getByCode("Hans")
	
	local data = {[0] = m_st, m_ts}
	local mod = {[0] = "st", "ts"}
	local var = {[0] = "Simp.", "Trad."}
	local sc = {[0] = Hans, Hant}
	
	local function find_stable_loop(chars, other, j)
		local display = ruby({["markup"] = "[" .. other .. "](" .. var[(j+1)%2] .. ")"})
		display = m_links.language_link{term = other, alt = display, lang = lang, sc = sc[(j+1)%2], tr = "-"}
		insert(chars, display)
		if data[(j+1)%2][other] == other then
			insert(chars, other)
			return chars, 1
		elseif not data[(j+1)%2][other] then
			insert(chars, "not found")
			return chars, 2
		elseif data[j%2][data[(j+1)%2][other]] ~= other then
			return find_stable_loop(chars, data[(j+1)%2][other], j + 1)
		else
			local display = ruby({["markup"] = "[" .. data[(j+1)%2][other] .. "](" .. var[j%2] .. ")"})
			display = m_links.language_link{term = data[(j+1)%2][other], alt = display, lang = lang, sc = sc[j%2], tr = "-"}
			insert(chars, display .. " (")
			display = ruby({["markup"] = "[" .. data[j%2][data[(j+1)%2][other]] .. "](" .. var[(j+1)%2] .. ")"})
			display = m_links.language_link{term = data[j%2][data[(j+1)%2][other]], alt = display, lang = lang, sc = sc[(j+1)%2], tr = "-"}
			insert(chars, display .. " etc.)")
			return chars, 3
		end
		
		return chars
	end
	
	for i = 0, 1, 1 do
		for char, other in pairs(data[i]) do
			if data[(i+1)%2][other] ~= char then
				local chars, issue = {}
				local display = ruby({["markup"] = "[" .. char .. "](" .. var[i] .. ")"})
				display = m_links.language_link{term = char, alt = display, lang = lang, sc = sc[i], tr = "-"}
				insert(chars, display)
				chars, issue = find_stable_loop(chars, other, i)
				if issue == 1 or issue == 2 then
					local sc_this, mod_this, j = {}
					if chars[#chars-1]:match(var[(i+1)%2]) then
						j = 1
					else
						j = 0
					end
					mod_this = mod[(i+j)%2]
					sc_this = {[0] = sc[(i+j)%2], sc[(i+j+1)%2]}
					for k, char in ipairs(chars) do
						chars[k] = m_script_utils.tag_text(char, lang, sc_this[k%2], "term")
					end
					if issue == 1 then
						discrepancy("zh/data/" .. mod_this, "character references itself: " .. concat(chars, " → "))
					elseif issue == 2 then
						discrepancy("zh/data/" .. mod_this, "missing character: " .. concat(chars, " → "))
					end
				elseif issue == 3 then
					for j, char in ipairs(chars) do
						chars[j] = m_script_utils.tag_text(char, lang, sc[(i+j)%2], "term")
					end
					discrepancy("zh/data/" .. mod[i], "possible mismatched character: " .. concat(chars, " → "))
				end
			end
		end
	end
end

local function check_serialization(modname)
	local serializers = {
		["Hani-sortkey/data/serialized"] = "Hani-sortkey/serializer",
	}
	
	if not serializers[modname] then
		return nil
	end
	
	local serializer = serializers[modname]
	local current_data = require("Module:" .. serializer).main(true)
	local stored_data = require("Module:" .. modname)
	if current_data ~= stored_data then
		discrepancy(modname, "<strong><u>Important!</u> Serialized data is out of sync. Use [[Module: ".. serializer .. "]] to update it. If you have made any changes to the underlying data, the serialized data <u>must</u> be updated before these changes will take effect.</strong>")
	end
end

-- Warning: cannot be called twice in the same module invocation because
-- some module-global variables are not reset between calls.
function export.do_checks(modules)
	messages = setmetatable({}, {
		__index = function (self, k)
			local val = Array()
			self[k] = val
			return val
		end
	})
	
	if modules["zh/data/ts"] or modules["zh/data/st"] then
		check_zh_trad_simp()
	end
	check_languages()
	check_etym_languages()

	-- families and scripts must be checked AFTER languages; languages checks fill out
	-- the nonempty_families and nonempty_scripts tables, used for testing if a family/script
	-- is ever used in the data
	check_families()
	check_scripts()
	
	check_wikidata_languages()
	
	if modules["labels/data"] then
		check_labels()
	end
	
	for module in pairs(modules) do
		check_serialization(module)
	end
	
	setmetatable(messages, nil)
	
	local function find_code(message)
		return string.match(message, "<code>([^<]+)</code>")
	end
	
	find_code = require("Module:fun").memoize(find_code)
	
	local function comp(message1, message2)
		local code1, code2 = find_code(message1), find_code(message2)
		if code1 and code2 then
			return code1 < code2
		else
			return message1 < message2
		end
	end
	
	for _, msglist in pairs(messages) do
		msglist:sort(comp)
	end
	
	local ret = messages
	messages = nil
	return ret
end

function export.format_message(modname, msglist)
	local header; if modname:match("^Module:") or modname:match("^Template:") then
		header = "===[[" .. modname .. "]]==="
	else
		header = "===[[Module:" .. modname .. "]]==="
	end
	return header
		.. msglist
			:map(
				function(msg)
					return "\n* " .. msg
				end)
			:concat()
end

function export.check_modules(args)
	
	local modules = {}
	for _, arg in ipairs(args) do
		modules[arg] = true
	end
	
	local ret = Array()
	local messages = export.do_checks(modules)
	
	for _, module in ipairs(args) do
		local msglist = messages[module]
		if msglist then
			ret:insert(export.format_message(module, msglist))
		end
	end
	return ret:concat("\n")
end

function export.check_modules_t(frame)
	local args = m_table.shallowcopy(frame.args)
	return export.check_modules(args)
end

function export.perform(frame)
	local messages = export.do_checks({})
	
	-- Format the messages
	local ret = Array()
	for modname, msglist in m_table.sortedPairs(messages) do
		ret:insert(export.format_message(modname, msglist))
	end
	
	-- Are there any messages?
	if i == 1 then
		return "<b class=\"success\">Glory to Arstotzka.</b>"
	else
		ret:insert(1, "<b class=\"warning\">Discrepancies detected:</b>")
		
		return ret:concat("\n")
	end
end

return export