local export = {}
local m_string_utils = require("Module:string utilities")
local rfind = m_string_utils.find
local rgmatch = m_string_utils.gmatch
local rsubn = m_string_utils.gsub
local ulen = mw.ustring.len
local unfc = mw.ustring.toNFC
local uupper = mw.ustring.upper
local m_data = mw.loadData("Module:headword/data")
local isLemma = m_data.lemmas
local isNonLemma = m_data.nonlemmas
local notranslit = m_data.notranslit
local toBeTagged = m_data.toBeTagged
-- If set to true, categories always appear, even in non-mainspace pages
local test_force_categories = false
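-- Cache of modules loaded by get_module() below, so that each module is require()d at most once.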
local modules = {}
local function get_module(mod)
if not modules[mod] then
modules[mod] = require("Module:" .. mod)
end
return modules[mod]
end
-- version of rsubn() that discards all but the first return value
local function rsub(term, foo, bar)
local retval = rsubn(term, foo, bar)
return retval
end
-- Add a tracking category to track entries with certain (unusually undesirable) properties. `track_id` is an identifier
-- for the particular property being tracked and goes into the tracking page. Specifically, this adds a link in the
-- page text to [[Template:tracking/headword/TRACK_ID]], meaning you can find all entries with the `track_id` property
-- by visiting [[Special:WhatLinksHere/Template:tracking/headword/TRACK_ID]].
--
-- If `code` (a language or script code) is given, an additional tracking page
-- [[Template:tracking/headword/TRACK_ID/CODE]] is linked to, and you can find all entries in the combination of
-- `track_id` and `code` by visiting [[Special:WhatLinksHere/Template:tracking/headword/TRACK_ID/CODE]]. This makes it
-- possible to isolate only the entries with a specific tracking property that are in a given language or script.
local function track(track_id, code)
local tracking_page = "headword/" .. track_id
local m_debug_track = get_module("debug/track")
if code then
m_debug_track{tracking_page, tracking_page .. "/" .. code}
else
m_debug_track(tracking_page)
end
return true
end
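-- Return true if `text` contains at least one word character belonging to the script with code `script_code`
-- (whitespace and punctuation are stripped first); throws an error on an unrecognized script code.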
local function text_in_script(text, script_code)
local sc = get_module("scripts").getByCode(script_code)
if not sc then
error("Internal error: Bad script code " .. script_code)
end
local characters = sc:getCharacters()
local out
if characters then
text = rsub(text, "%W", "")
out = rfind(text, "[" .. characters .. "]")
end
if out then
return true
else
return false
end
end
local spacingPunctuation = "[%s%p]+"
--[[ List of punctuation or spacing characters that are found inside of words.
Used to exclude these characters from the spacingPunctuation pattern above. ]]
local wordPunc = "-־׳״'.·*’་•:"
local notWordPunc = "[^" .. wordPunc .. "]+"
-- Format a term (either a head term or an inflection term) along with any left or right qualifiers, references or
-- customized separator: `part` is the object specifying the term, which should optionally contain:
-- * left qualifiers in `q`, an array of strings (or `qualifiers` for compatibility purposes);
-- * right qualifiers in `qq`, an array of strings;
-- * references in `refs`, an array either of strings (formatted reference text) or objects containing fields `text`
-- (formatted reference text) and optionally `name` and/or `group`;
-- * a separator in `separator`, defaulting to " <i>or</i> " if this is not the first term (j > 1), otherwise "".
-- `formatted` is the formatted version of the term itself, and `j` is the index of the term.
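-- For example (a sketch, assuming [[Module:qualifier]] renders qualifiers in parentheses): a second term
-- (j = 2) with {q = {"archaic"}} would come out roughly as " <i>or</i> (archaic) " followed by the formatted term.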
local function format_term_with_qualifiers_and_refs(part, formatted, j)
local left_qualifiers, right_qualifiers
local reftext
left_qualifiers = part.q and #part.q > 0 and part.q
if left_qualifiers then
left_qualifiers = get_module("qualifier").format_qualifier(left_qualifiers) .. " "
end
right_qualifiers = part.qq and #part.qq > 0 and part.qq
if right_qualifiers then
right_qualifiers = " " .. get_module("qualifier").format_qualifier(right_qualifiers)
end
if part.refs and #part.refs > 0 then
local refs = {}
for _, ref in ipairs(part.refs) do
if type(ref) ~= "table" then
ref = {text = ref}
end
local refargs
if ref.name or ref.group then
refargs = {name = ref.name, group = ref.group}
end
table.insert(refs, mw.getCurrentFrame():extensionTag("ref", ref.text, refargs))
end
reftext = table.concat(refs)
end
local separator = part.separator or j > 1 and " <i>or</i> " -- use "" to request no separator
if left_qualifiers then
formatted = left_qualifiers .. formatted
end
if reftext then
formatted = formatted .. reftext
end
if right_qualifiers then
formatted = formatted .. right_qualifiers
end
if separator then
formatted = separator .. formatted
end
return formatted
end
--[==[Return true if the given head is multiword according to the algorithm used in full_headword().]==]
function export.head_is_multiword(head)
for possibleWordBreak in rgmatch(head, spacingPunctuation) do
if rfind(possibleWordBreak, notWordPunc) then
return true
end
end
return false
end
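-- For example, head_is_multiword("free variable") is true, and add_multiword_links("free variable") returns
-- "[[free]] [[variable]]" (a sketch; the exact link boundaries depend on the word-punctuation characters above).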
--[==[Add links to a multiword head.]==]
function export.add_multiword_links(head)
local function workaround_to_exclude_chars(s)
return rsub(s, notWordPunc, "]]%1[[")
end
head = "[[" .. rsub(head, spacingPunctuation, workaround_to_exclude_chars) .. "]]"
--[=[
use this when workaround is no longer needed:
head = "[[" .. rsub(head, WORDBREAKCHARS, "]]%1[[") .. "]]"
]=]
-- Remove any empty links, which could have been created above
-- at the beginning or end of the string.
head = head:gsub("%[%[%]%]", "")
return head
end
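-- Return true if the page should not be categorized: "Unsupported titles/..." pages in the mainspace and
-- "Gestures/..." pages in the Appendix namespace.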
local function non_categorizable(data)
return (data.title:inNamespace("") and data.title.text:find("^Unsupported titles/"))
or (data.title:inNamespace("Appendix") and data.title.text:find("^Gestures/"))
end
-- Format a headword with transliterations.
local function format_headword(data)
local m_scriptutils = require("Module:script utilities")
-- Are there non-empty transliterations?
local has_translits = false
local has_manual_translits = false
------ Format the headwords. ------
local head_parts = {}
for j, head in ipairs(data.heads) do
if head.tr or head.ts then
has_translits = true
end
if head.tr and head.tr_manual or head.ts then
has_manual_translits = true
end
local formatted
-- Apply processing to the headword, for formatting links and such.
if head.term:find("[[", nil, true) and head.sc:getCode() ~= "Imag" then
formatted = get_module("links").language_link({term = head.term, lang = data.lang}, false)
else
formatted = data.lang:makeDisplayText(head.term, head.sc, true)
end
-- Add language and script wrapper.
formatted = m_scriptutils.tag_text(formatted, data.lang, head.sc, "head", nil, j == 1 and data.id or nil)
-- Add qualifiers, references and separator.
table.insert(head_parts, format_term_with_qualifiers_and_refs(head, formatted, j))
end
head_parts = table.concat(head_parts)
if has_manual_translits then
-- [[Special:WhatLinksHere/Template:tracking/headword/has-manual-translit]]
-- [[Special:WhatLinksHere/Template:tracking/headword/has-manual-translit/LANGCODE]]
track("has-manual-translit", data.lang:getCode())
end
------ Format the transliterations and transcriptions. ------
local translits_formatted
if has_translits then
local translit_parts = {}
for i, head in ipairs(data.heads) do
if head.tr or head.ts then
local this_parts = {}
if head.tr then
table.insert(this_parts, m_scriptutils.tag_translit(head.tr, data.lang:getCode(), "head", nil, head.tr_manual))
if head.ts then
table.insert(this_parts, " ")
end
end
if head.ts then
table.insert(this_parts, "/" .. m_scriptutils.tag_transcription(head.ts, data.lang:getCode(), "head") .. "/")
end
table.insert(translit_parts, table.concat(this_parts))
end
end
translits_formatted = " (" .. table.concat(translit_parts, " <i>or</i> ") .. ")"
local transliteration_page = mw.title.new(data.lang:getCanonicalName() .. " transliteration", "Wiktionary")
if transliteration_page then
local success, exists = pcall(function () return transliteration_page.exists end)
if success and exists then
translits_formatted = " [[Wiktionary:" .. data.lang:getCanonicalName() .. " transliteration|•]]" .. translits_formatted
end
end
else
translits_formatted = ""
end
------ Paste heads and transliterations/transcriptions. ------
return head_parts .. translits_formatted
end
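-- Format the genders following the headword, and collect any gender/number categories into `data.categories`.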
local function format_genders(data)
if data.genders and #data.genders > 0 then
local pos_for_cat
if not data.nogendercat and not m_data.no_gender_cat[data.lang:getCode()] then
local pos_category = data.pos_category:gsub("^reconstructed ", "")
pos_for_cat = m_data.pos_for_gender_number_cat[pos_category]
end
local text, cats = require("Module:gender and number").format_genders(data.genders, data.lang, pos_for_cat)
for _, cat in ipairs(cats) do
table.insert(data.categories, cat)
end
return " " .. text
else
return ""
end
end
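-- Format one inflection (an italicized label plus zero or more terms), returning the formatted text and a flag
-- indicating whether any of the terms displays a transliteration.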
local function format_inflection_parts(data, parts)
local any_part_translit = false
for j, part in ipairs(parts) do
if type(part) ~= "table" then
part = {term = part}
end
local partaccel = part.accel
local face = part.hypothetical and "hypothetical" or "bold"
-- The final 'or data.nolink' clause allows 'nolink=true' to be set directly in the
-- 'data' table to disable links for the entire headword, e.g. when inflected forms
-- aren't entry-worthy, as in Vulgar Latin.
local nolink = part.hypothetical or part.nolink or data.nolink
local formatted
if part.label then
-- FIXME: There should be a better way of italicizing a label. As is, this isn't customizable.
formatted = "<i>" .. part.label .. "</i>"
else
-- Convert the term into a full link. Don't show a transliteration here unless enable_auto_translit is
-- requested, either at the `parts` level (i.e. per inflection) or at the `data.inflections` level (i.e.
-- specified for all inflections). This is controllable in {{head}} using autotrinfl=1 for all inflections,
-- or fNautotr=1 for an individual inflection (remember that a single inflection may be associated with
-- multiple terms). The reason for doing this is to avoid clutter in headword lines by default in languages
-- where the script is relatively straightforward to read by learners (e.g. Greek, Russian), but allow it
-- to be enabled in languages with more complex scripts (e.g. Arabic).
local tr = part.translit or (not (parts.enable_auto_translit or data.inflections.enable_auto_translit) and "-" or nil)
if tr ~= "-" then
any_part_translit = true
end
formatted = get_module("links").full_link(
{
term = not nolink and part.term or nil,
alt = part.alt or (nolink and part.term or nil),
lang = part.lang or data.lang,
-- FIXME, code smell in always using the first script.
sc = part.sc or parts.sc or (not part.lang and data.heads[1].sc),
id = part.id,
genders = part.genders,
tr = tr,
ts = part.transcription,
accel = partaccel or parts.accel,
},
face,
false
)
end
parts[j] = format_term_with_qualifiers_and_refs(part, formatted, j)
end
local parts_output
if #parts > 0 then
parts_output = " " .. table.concat(parts)
elseif parts.request then
parts_output = " <small>[please provide]</small>"
table.insert(data.categories, "Requests for inflections in " .. data.lang:getCanonicalName() .. " entries")
else
parts_output = ""
end
return "<i>" .. parts.label .. "</i>" .. parts_output, any_part_translit
end
-- Format the inflections following the headword.
local function format_inflections(data)
local any_part_translit = false
if data.inflections and #data.inflections > 0 then
-- Format each inflection individually.
for key, infl in ipairs(data.inflections) do
local this_any_part_translit
data.inflections[key], this_any_part_translit = format_inflection_parts(data, infl)
if this_any_part_translit then
any_part_translit = true
end
end
local concat_result = table.concat(data.inflections, ", ")
return " (" .. concat_result .. ")"
else
return ""
end
end
--[==[
Return "lemma" if the given POS is a lemma, "non-lemma form" if a non-lemma form, or nil
if unknown. The POS passed in must be in its plural form ("nouns", "prefixes", etc.).
If you have a POS in its singular form, call pluralize() in [[Module:string utilities]] to
pluralize it in a smart fashion that knows when to add '-s' and when to add '-es'.

If `best_guess` is given and the POS is in neither the lemma nor non-lemma list, guess
based on whether it ends in " forms"; otherwise, return nil.]==]
function export.pos_lemma_or_nonlemma(plpos, best_guess)
-- Is it a lemma category?
if isLemma[plpos] then
return "lemma"
end
local plpos_no_recon = plpos:gsub("^reconstructed ", "")
if isLemma[plpos_no_recon] then
return "lemma"
end
-- Is it a nonlemma category?
if isNonLemma[plpos] or isNonLemma[plpos_no_recon] then
return "non-lemma form"
end
local plpos_no_mut = plpos:gsub("^mutated ", "")
if isLemma[plpos_no_mut] or isNonLemma[plpos_no_mut] then
return "non-lemma form"
elseif best_guess then
return plpos:find(" forms$") and "non-lemma form" or "lemma"
else
return nil
end
end
-- Find and return the maximum index in the array `data[element]` (which may have gaps in it), and initialize it to a
-- zero-length array if unspecified. Check to make sure all keys are numeric (other than "maxindex", which is set by
-- [[Module:parameters]] for list parameters), all values are strings, and unless `allow_blank_string` is given,
-- no blank (zero-length) strings are present.
local function init_and_find_maximum_index(data, element, allow_blank_string)
local maxind = 0
if not data[element] then
data[element] = {}
end
local typ = type(data[element])
if typ ~= "table" then
error(("In full_headword(), `data.%s` must be an array but is a %s"):format(element, typ))
end
for k, v in pairs(data[element]) do
if k ~= "maxindex" then
if type(k) ~= "number" then
error(("Unrecognized non-numeric key '%s' in `data.%s`"):format(k, name))
end
if k > maxind then
maxind = k
end
if v then
if type(v) ~= "string" then
error(("For key '%s' in `data.%s`, value should be a string but is a %s"):format(k, element, type(v)))
end
if not allow_blank_string and v == "" then
error(("For key '%s' in `data.%s`, blank string not allowed; use 'false' for the default"):format(k, element))
end
end
end
end
return maxind
end
--[==[This is the primary external entry point.
{{lua|full_headword(data)}}
This is used by {{temp|head}} and various language-specific headword templates (e.g. {{temp|ru-adj}} for Russian adjectives, {{temp|de-noun}} for German nouns, etc.) to display an entire headword line.
See [[#Further explanations for full_headword()]]
]==]
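-- A minimal sketch of a call from a language-specific headword module (the field values are hypothetical):
--   local lang = require("Module:languages").getByCode("ru")
--   return require("Module:headword").full_headword {
--       lang = lang,
--       pos_category = "nouns",
--       categories = {},
--       heads = {{term = "пример"}},
--       genders = {"m"},
--       inflections = {},
--   }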
function export.full_headword(data)
-- Prevent data from being destructively modified.
local data = require("Module:table").shallowcopy(data)
------------ 1. Basic checks for old-style (multi-arg) calling convention. ------------
if data.getCanonicalName then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) of properties, not a language object")
end
if not data.lang or type(data.lang) ~= "table" or not data.lang.getCode then
error("In full_headword(), the first argument `data` needs to be a Lua object (table) and `data.lang` must be a language object")
end
if data.id and type(data.id) ~= "string" then
error("The id in the data table should be a string.")
end
------------ 2. Initialize pagename etc. ------------
local langcode = data.lang:getCode()
local langname = data.lang:getCanonicalName()
if data.pagename then -- for testing, doc pages, etc.
data.title = mw.title.new(data.pagename)
if not data.title then
error(("Bad value for `data.pagename`: '%s'"):format(data.pagename))
end
else
data.title = mw.title.getCurrentTitle()
end
local pagename = data.title.text
local namespace = data.title.nsText
-- Check the namespace against the language type.
if namespace == "" then
if data.lang:hasType("reconstructed") then
error("Entries in " .. langname .. " must be placed in the Reconstruction: namespace")
elseif data.lang:hasType("appendix-constructed") then
error("Entries in " .. langname .. " must be placed in the Appendix: namespace")
end
end
------------ 3. Initialize `data.heads` table; if old-style, convert to new-style. ------------
if type(data.heads) == "table" and type(data.heads[1]) == "table" then
-- new-style
if data.translits or data.transcriptions then
error("In full_headword(), if `data.heads` is new-style (array of head objects), `data.translits` and `data.transcriptions` cannot be given")
end
else
-- convert old-style `heads`, `translits` and `transcriptions` to new-style
local maxind = math.max(
init_and_find_maximum_index(data, "heads"),
init_and_find_maximum_index(data, "translits", true),
init_and_find_maximum_index(data, "transcriptions", true)
)
for i = 1, maxind do
data.heads[i] = {
term = data.heads[i],
tr = data.translits[i],
ts = data.transcriptions[i],
}
end
end
-- Make sure there's at least one head.
if not data.heads[1] then
data.heads[1] = {}
end
------------ 4. Initialize and validate `data.categories`, determine `pos_category` if not given, and add basic categories. ------------
init_and_find_maximum_index(data, "categories")
local pos_category_already_present = false
if #data.categories > 0 then
local escaped_langname = require("Module:pattern utilities").pattern_escape(langname)
local matches_lang_pattern = "^" .. escaped_langname .. " "
for _, cat in ipairs(data.categories) do
-- Does the category begin with the language name? If not, tag it with a tracking category.
if not cat:find(matches_lang_pattern) then
-- [[Special:WhatLinksHere/Template:tracking/headword/no lang category]]
-- [[Special:WhatLinksHere/Template:tracking/headword/no lang category/LANGCODE]]
track("no lang category", langcode)
end
end
-- If `pos_category` not given, try to infer it from the first specified category. If this doesn't work, we
-- throw an error below.
if not data.pos_category and data.categories[1]:find(matches_lang_pattern) then
data.pos_category = data.categories[1]:gsub(matches_lang_pattern, "")
-- Optimization to avoid inserting category already present.
pos_category_already_present = true
end
end
if not data.pos_category then
error("`data.pos_category` not specified and could not be inferred from the categories given in "
.. "`data.categories`. Either specify the plural part of speech in `data.pos_category` "
.. "(e.g. \"proper nouns\") or ensure that the first category in `data.categories` is formed from the "
.. "language's canonical name plus the plural part of speech (e.g. \"Norwegian Bokmål proper nouns\")."
)
end
-- Insert a category at the beginning for the part of speech unless it's already present or `data.noposcat` given.
if not pos_category_already_present and not data.noposcat then
local pos_category = langname .. " " .. data.pos_category
-- FIXME: [[User:Theknightwho]] Why is this special case here? Please add an explanatory comment.
if pos_category ~= "Translingual Han characters" then
table.insert(data.categories, 1, pos_category)
end
end
-- Try to determine whether the part of speech refers to a lemma or a non-lemma form; if we can figure this out,
-- add an appropriate category.
local postype = export.pos_lemma_or_nonlemma(data.pos_category)
if not postype then
-- We don't know what this category is, so tag it with a tracking category.
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos]]
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/LANGCODE]]
track("unrecognized pos", langcode)
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/POS]]
-- [[Special:WhatLinksHere/Template:tracking/headword/unrecognized pos/POS/LANGCODE]]
track("unrecognized pos/pos/" .. data.pos_category, langcode)
elseif not data.noposcat then
table.insert(data.categories, 1, langname .. " " .. postype .. "s")
end
------------ 5. Create a default headword, and add links to multiword page names. ------------
-- Determine if term is reconstructed
local is_reconstructed = namespace == "Reconstruction" or data.lang:hasType("reconstructed")
-- Create a default headword.
local default_head = data.title.subpageText:gsub("^Unsupported titles/", "")
local unmodified_default_head = default_head
-- Add links to multi-word page names when appropriate
if not m_data.no_multiword_links[langcode] and not is_reconstructed and export.head_is_multiword(default_head) then
default_head = export.add_multiword_links(default_head)
end
if is_reconstructed then
default_head = "*" .. default_head
end
------------ 6. Fill in missing values in `data.heads`. ------------
-- True if any script among the headword scripts has spaces in it.
local any_script_has_spaces = false
-- True if any term has a redundant head= param.
local has_redundant_head_param = false
for _, head in ipairs(data.heads) do
------ 6a. If missing head, replace with default head.
if not head.term then
head.term = default_head
elseif head.term == default_head then
has_redundant_head_param = true
end
------ 6b. Try to detect the script(s) if not provided. If a per-head script is provided, that takes precedence,
------ otherwise fall back to the overall script if given. If neither given, autodetect the script.
if not head.sc then
if data.sc then
-- Overall script given.
head.sc = data.sc
else
-- Autodetect script.
head.sc = data.lang:findBestScript(head.term)
end
end
-- If using a discouraged character sequence, add to maintenance category.
if head.sc:hasNormalizationFixes() == true then
local composed_head = unfc(head.term)
if head.sc:fixDiscouragedSequences(composed_head) ~= composed_head then
table.insert(data.categories, "Pages using discouraged character sequences")
end
end
any_script_has_spaces = any_script_has_spaces or head.sc:hasSpaces()
------ 6c. Create automatic transliterations for any non-Latin headwords without manual translit given
------ (provided automatic translit is available, e.g. not in Persian or Hebrew).
-- Make transliterations
head.tr_manual = nil
-- Try to generate a transliteration if necessary
if head.tr == "-" then
head.tr = nil
elseif head.tr then
head.tr_manual = true
elseif not notranslit[langcode] and head.sc:isTransliterated() then
head.tr = (data.lang:transliterate(get_module("links").remove_links(head.term), head.sc))
head.tr = head.tr and mw.text.trim(head.tr)
-- If there is still no transliteration, add the entry to a cleanup category.
if not head.tr then
head.tr = "<small>transliteration needed</small>"
table.insert(data.categories, "Requests for transliteration of " .. langname .. " terms")
end
if head.tr then
head.tr_manual = false
end
end
-- Link to the transliteration entry for languages that require this.
if head.tr and data.lang:link_tr() then
head.tr = get_module("links").full_link {
term = head.tr,
lang = data.lang,
sc = get_module("scripts").getByCode("Latn"),
tr = "-"
}
end
end
------------ 7. Maybe tag the title with the appropriate script code, using the `display_title` mechanism. ------------
-- Assumes that the scripts in "toBeTagged" will never occur in the Reconstruction namespace.
-- (FIXME: Don't make assumptions like this, and if you need to do so, throw an error if the assumption is violated.)
-- Avoid tagging ASCII as Hani even when it is tagged as Hani in the headword, as in [[check]]. The check for ASCII
-- might need to be expanded to a check for any Latin characters and whitespace or punctuation.
local display_title, matched
-- Where there are multiple headwords, use the script for the first. This assumes the first headword is similar to
-- the pagename, and that headwords that are in different scripts from the pagename aren't first. This seems to be
-- about the best we can do (alternatively we could potentially do script detection on the pagename).
local dt_script = data.heads[1].sc
local dt_script_code = dt_script:getCode()
local page_non_ascii = namespace == "" and not pagename:find("^[%z\1-\127]+$")
if page_non_ascii and toBeTagged[dt_script_code]
or (dt_script_code == "Jpan" and (text_in_script(pagename, "Hira") or text_in_script(pagename, "Kana")))
or (dt_script_code == "Kore" and text_in_script(pagename, "Hang")) then
display_title = '<span class="' .. dt_script_code .. '">' .. pagename .. '</span>'
-- Keep Han entries region-neutral in the display title.
elseif page_non_ascii and (dt_script_code == "Hant" or dt_script_code == "Hans") then
display_title = '<span class="Hani">' .. pagename .. '</span>'
elseif namespace == "Reconstruction" then
display_title, matched = rsubn(
data.title.fullText,
"^(Reconstruction:[^/]+/)(.+)$",
function(before, term)
return before ..
require("Module:script utilities").tag_text(
term,
data.lang,
dt_script
)
end
)
if matched == 0 then
display_title = nil
end
end
if display_title then
local frame = mw.getCurrentFrame()
frame:callParserFunction(
"DISPLAYTITLE",
display_title
)
end
------------ 8. Insert additional categories. ------------
if data.force_cat_output then
-- [[Special:WhatLinksHere/Template:tracking/headword/force cat output]]
track("force cat output")
end
if has_redundant_head_param then
if not data.no_redundant_head_cat then
table.insert(data.categories, langname .. " terms with redundant head parameter")
end
end
-- If the default headword (based on the pagename) is multiword, maybe insert into "LANG multiword terms".
if not data.nomultiwordcat and any_script_has_spaces and postype == "lemma" and not m_data.no_multiword_cat[langcode] then
-- Check for spaces or hyphens, but exclude prefixes and suffixes.
-- Use the pagename, not the head= value, because the latter may have extra
-- junk in it, e.g. superscripted text that throws off the algorithm.
local checkpattern = ".[%s%-፡]."
if m_data.hyphen_not_multiword_sep[langcode] then
-- Exclude hyphens if the data module states that they should for this language
checkpattern = ".[%s፡]."
end
if rfind(unmodified_default_head, checkpattern) and not non_categorizable(data) then
table.insert(data.categories, langname .. " multiword terms")
end
end
if data.sccat then
for _, head in ipairs(data.heads) do
table.insert(data.categories, langname .. " " .. data.pos_category .. " in " .. head.sc:getDisplayForm())
end
end
-- Categorise for unusual characters
local standard = data.lang:getStandardCharacters()
if standard then
if ulen(data.title.subpageText) ~= 1 and not non_categorizable(data) then
for character in rgmatch(data.title.subpageText, "([^" .. standard .. "])") do
local upper = uupper(character)
if not rfind(upper, "[" .. standard .. "]") then
character = upper
end
table.insert(data.categories, langname .. " terms spelled with " .. character)
end
end
end
-- Categorise for palindromes
if not data.nopalindromecat and namespace ~= "Reconstruction" and ulen(data.title.subpageText) > 2
-- FIXME: Use of first script here seems hacky. What is the clean way of doing this in the presence of
-- multiple scripts?
and require("Module:palindromes").is_palindrome(data.title.subpageText, data.lang, data.heads[1].sc) then
table.insert(data.categories, langname .. " palindromes")
end
if namespace == "" and not data.lang:hasType("reconstructed") then
local m_links = get_module("links")
for _, head in ipairs(data.heads) do
if data.title.prefixedText ~= m_links.getLinkPage(m_links.remove_links(head.term), data.lang, head.sc) then
-- [[Special:WhatLinksHere/Template:tracking/headword/pagename spelling mismatch]]
-- [[Special:WhatLinksHere/Template:tracking/headword/pagename spelling mismatch/LANGCODE]]
track("pagename spelling mismatch", data.lang:getCode())
break
end
end
end
------------ 9. Format and return headwords, genders, inflections and categories. ------------
-- Format and return all the gathered information. This may add more categories (e.g. gender/number categories),
-- so make sure we do it before evaluating `data.categories`.
local text =
format_headword(data) ..
format_genders(data) ..
format_inflections(data)
return
text ..
require("Module:utilities").format_categories(
data.categories, data.lang, data.sort_key, nil,
data.force_cat_output or test_force_categories, data.heads[1].sc
)
end
return export