-- htmlq/html.lua

local logger = require("logging")

--- Strip leading and trailing whitespace from a string.
-- @param str  the string to trim
-- @return `str` without surrounding whitespace
local function trim(str)
	local stripped = str:match("^%s*(.-)%s*$")
	return stripped
end

--- Return a new table holding the same key/value pairs as `t`.
-- Nested tables are shared with the original, not duplicated.
-- @param t  table to copy
-- @return the copy
local function shallow_copy(t)
	local duplicate = {}

	for key, value in pairs(t) do
		duplicate[key] = value
	end

	return duplicate
end


-- Module table: the public functions defined below are attached to it.
local M = {}

-- Tags whose body the tokeniser grabs verbatim up to the matching closing tag,
-- and whose text nodes are left untouched by the whitespace cleanup.
local RAW_TEXT_TAGS = {}
for _, tag in ipairs({ "script", "style", "pre" }) do
	RAW_TEXT_TAGS[tag] = true
end

-- Void tags never have content (the so-called "self-closing" tags).
local VOID_TAGS = {}
for _, tag in ipairs({
	"area",
	"base",
	"br",
	"col",
	"embed",
	"hr",
	"img",
	"input",
	"link",
	"meta",
	"param", -- deprecated
	"source",
	"track",
	"wbr",
}) do
	VOID_TAGS[tag] = true
end

-- Tags rendered inline: the text/markdown extraction does not append a
-- newline after them. Any tag not listed here is treated as a block.
local INLINE_TAGS = {}
for _, group in ipairs({
	-- Text formatting
	{
		"a", "abbr", "b", "bdi", "bdo", "cite", "code", "data", "dfn", "em",
		"i", "kbd", "mark", "q", "ruby", "s", "samp", "small", "span",
		"strong", "sub", "sup", "time", "u", "var",
	},

	-- Interactive elements
	{ "button", "label", "select", "textarea" },

	-- Media/content
	{ "img", "picture", "map", "object" },

	-- Line break
	{ "br", "wbr" },

	-- Forms
	{ "input", "output", "progress", "meter" },

	-- Scripting
	{ "script", "noscript", "template" },
}) do
	for _, tag in ipairs(group) do
		INLINE_TAGS[tag] = true
	end
end

--- Collapse every run of whitespace in `str` into a single space.
-- @param str  the string to normalise
-- @return the normalised string (exactly one value)
local function normalize_whitespace(str)
	-- string.gsub also returns the substitution count; keep only the string
	-- so the count cannot leak into a caller's argument or return list.
	local collapsed = str:gsub("%s+", " ")
	return collapsed
end

--- Create a DOM node and, when a parent is supplied, append it to the
-- parent's `children`.
--
-- Besides its data fields (`tag_name`, `attributes`, `content`, `children`,
-- `parent`) every node carries its own methods. A metatable makes reads of
-- keys that are not stored on the node fall back to `attributes`, and rejects
-- writes to keys that exist neither on the node nor in `attributes`.
-- Note that a nil `tag_name` / `parent` is not stored on the node, so reading
-- those fields on such a node goes through the `attributes` fallback.
--
-- @param tag_name     tag name; ":text" marks a text node, nil is used for the document root
-- @param parent_elem  parent node, or nil for a detached/root node
-- @return the newly created node
function M.make_dom_element( tag_name, parent_elem )
	local o = {
		tag_name = tag_name,
		attributes = {},
		content = "",

		children = {},
		parent = parent_elem,

		-- 1-based position of this node among its parent's children.
		-- Returns -1 when the node has no parent, and nothing when the node
		-- is not found in the parent's child list.
		get_child_index = function( self )
			if not self.parent then
				return -1
			end

			for i, child in ipairs(self.parent.children) do
				if child == self then return i end
			end
		end,

		-- The child that follows this node in its parent, or nil when there
		-- is no parent or this node is the last child.
		get_next_sibling = function( self )
			if not self.parent then return nil end

			local found_self = false
			for _, child in ipairs(self.parent.children) do
				if found_self then
					return child
				end

				if child == self then
					found_self = true
				end
			end

			return nil
		end,

		-- Method form of M.check_simple_selector.
		check_simple_selector = function( self, selector )
			return M.check_simple_selector( self, selector )
		end,

		-- Call fn(node) on this node and then on every descendant (pre-order).
		foreach = function( self, fn )
			fn( self )

			for _, child in ipairs(self.children or {}) do
				child:foreach( fn )
			end
		end,

		-- Concatenated text of all descendant text nodes; a newline is
		-- appended after every child that is not an inline tag.
		inner_text = function(self)
			if self.tag_name == ":text" then
				return self.content
			end

			local text = ""
			for _, child in ipairs(self.children) do
				text = text .. child:inner_text()

				if not INLINE_TAGS[child.tag_name] then
					text = text .. "\n"
				end
			end

			return text
		end,

		-- Render this subtree as Markdown-like text.
		-- in_pre:    true when an ancestor is a <pre>; only propagated to children.
		-- root_call: internal flag, false on recursive calls so the newline
		--            cleanup runs once, on the outermost call. Defaults to true.
		inner_markdown = function(self, in_pre, root_call)
			in_pre = in_pre or false
			-- `root_call or true` would turn an explicit false back into true,
			-- so only apply the default when the argument was omitted.
			if root_call == nil then
				root_call = true
			end

			if self.tag_name == "script" or self.tag_name == "style" then
				return ""
			end


			if self.tag_name == ":text" then
				return self.content
			end

			-- The document root has no tag name; use "" for the string operations below.
			local own_tag = self.tag_name or ""

			local text = ""
			local is_list_item = self.tag_name == "li"
			local parent_is_ul = self.parent and self.parent.tag_name == "ul"
			local parent_is_ol = self.parent and self.parent.tag_name == "ol"

			local is_heading = own_tag:match("^h[1-6]$")
			local is_pre = self.tag_name == "pre"

			if is_heading then
				local level = tonumber(own_tag:sub(2))
				text = "\n" .. string.rep("#", level) .. " "
			end

			if is_list_item then
				if parent_is_ul then
					text = "* "
				elseif parent_is_ol then
					local position = self:get_child_index()
					text = position .. ". "
				end
			end


			-- Process children (argument order: in_pre first, then root_call)
			local inner = ""
			for _, child in ipairs(self.children) do
				inner = inner .. child:inner_markdown(in_pre or is_pre, false)
			end

			if self.tag_name == "br" then
				text = text .. "\n" .. inner
			elseif is_pre then
				text = text .. inner
			elseif is_heading then
				text = text .. normalize_whitespace(inner)
			elseif self.tag_name == "strong" then
				text = text .. "**" .. normalize_whitespace(inner) .. "**"
			elseif self.tag_name == "em" then
				text = text .. "_" .. normalize_whitespace(inner) .. "_"
			elseif self.tag_name == "code" then
				local is_block = self.parent and self.parent.tag_name == "pre"
				if is_block then
					text = text .. "\n```\n" .. inner .. "\n```\n"
				else
					text = text .. "`" .. normalize_whitespace(inner) .. "`"
				end
			elseif self.tag_name == "a" then
				text = text .. "[" .. normalize_whitespace(inner) .. "]"

				if self.attributes.href then
					text = text .. "(" .. self.attributes.href .. ")"
				end
			else
				text = text .. inner
			end

			-- Add newlines after block elements
			if not INLINE_TAGS[self.tag_name] then
				text = text .. "\n"
			end

			if root_call then
				-- Step 1: Remove whitespace between newlines
				text = text:gsub("(\n)%s+(\n)", "%1%2")
				-- Step 2: Replace 3+ consecutive newlines with just two
				text = text:gsub("\n\n\n+", "\n\n")
			end

			return text
		end
	}

	if parent_elem then
		table.insert( parent_elem.children, o )
	end

	local mt = {
		-- Only fires for keys that are not stored on the node itself.
		__newindex = function(elem, key, value)
			-- Allow modification of existing attributes
			if rawget(elem.attributes, key) ~= nil then
				rawset(elem.attributes, key, value)
			else
				-- Prevent adding new attributes
				error("Cannot add new attribute to DOM element: " .. tostring(key))
			end
		end,
		-- Only fires for keys that are not stored on the node itself.
		__index = function(elem, key)
			-- Allow access to attributes
			return rawget(elem.attributes, key)
		end
	}

	setmetatable(o, mt)
	return o
end


--- Normalise raw HTML before tokenising.
--
-- Both substitutions run over the whole string, so they also affect text
-- and script content, not only markup.
--
-- @param content  the raw HTML string
-- @return the normalised HTML string
function M.preprocess( content )
	-- remove "self closing" slashes as they MUST be ignored (spec)
	-- and would cause problems
	content = content:gsub("/%s*>", ">")
	-- remove whitespace at the start of "</closing>" tags ("</ div>" -> "</div>").
	-- The tokeniser looks up raw-text closing tags with a plain "</tagname"
	-- search, which relies on this. (The previous pattern required a second
	-- "/" after the whitespace and so never matched a normal closing tag.)
	content = content:gsub("</%s+", "</")

	return content
end


--- Split an HTML string into a flat list of tokens.
--
-- Token shapes (all are tables with a `type` field):
--   {type="TEXT", value=...}       text found outside of tags
--   {type="START_OPENING_TAG"}     a "<"
--   {type="START_CLOSING_TAG"}     a "</"
--   {type="WORD", value=...}       whitespace-separated piece inside a tag
--                                  (tag name, or attribute with its quoted value)
--   {type="END_TAG"}               the ">" closing a tag
--
-- Comments ("<!-- ... -->") and other "<!...>" declarations are skipped.
-- The body of a RAW_TEXT_TAGS element is emitted as a single TEXT token,
-- except for a <pre> whose body contains both "<" and ">", which is
-- tokenised recursively.
--
-- NOTE(review): a raw-text tag without a closing tag still terminates the
-- whole process through os.exit(-5); kept as-is for compatibility.
--
-- @param content  HTML string (expected to have gone through M.preprocess)
-- @return the list of tokens
function M.tokenise( content )
	local TOKENS = {}

	-- state
	local in_tag = nil -- nil outside of tags, otherwise "opening" or "closing"
	local currently_opened_quotes = nil -- the quote character we are inside of, if any
	local text_memory = "" -- characters accumulated since the last emitted token

	local i = 1

	while i <= #content do
		local char = content:sub(i,i)


		--
		-- Taking care of quotes
		--
		if in_tag then
			-- finding matching quotes
			if currently_opened_quotes ~= nil and char == currently_opened_quotes then
				currently_opened_quotes = nil
				text_memory = text_memory .. char
				goto continue
			end

			-- Opening a new set of quotes
			if currently_opened_quotes == nil and (char == "'" or char == '"') then
				currently_opened_quotes = char
				text_memory = text_memory .. char
				goto continue
			end

			-- reaching here means:
			-- - we're in a tag, inside quotes
			-- - the character is not the closing quote mark
			-- So just add it and get on with it.
			if currently_opened_quotes ~= nil then
				text_memory = text_memory .. char
				goto continue
			end
		end


		if char == "<" then
			-- comments: jump to the end of "-->", or to the end of the input
			if content:sub(i, i+3) == "<!--" then
				local end_i = content:find("-->", i+3, true)
				if end_i then
					i = end_i + 2
				else
					i = #content
				end

				goto continue
			end

			-- other declarations (doctype, ...): jump to the closing ">".
			-- When it is missing, skip the rest of the input instead of
			-- letting `i` become nil.
			if content:sub(i, i+1) == "<!" then
				i = content:find(">", i, true) or #content
				goto continue
			end

			---------------------------------
			if #text_memory ~= 0 then
				table.insert( TOKENS, {type="TEXT", value=text_memory} )
				text_memory = ""
			end

			-- closing tag
			if content:sub(i, i+1) == "</" then
				table.insert( TOKENS, {type="START_CLOSING_TAG"} )
				in_tag = "closing"
				i = i+1
				goto continue
			end

			table.insert( TOKENS, {type="START_OPENING_TAG"} )
			in_tag = "opening"
			goto continue
		end


		if char == ">" and in_tag then
			-- first, cleanup the text_memory, as the closing > is often side-by-side with the last "word"
			if #text_memory ~= 0 then
				local word = trim(text_memory)
				if not word:match("^%s*$") then
					table.insert( TOKENS, {type="WORD", value=word})
				end
				text_memory = ""
			end

			table.insert( TOKENS, {type = "END_TAG"} )

			-- closing tags don't require any more work.
			if in_tag == "closing" then
				in_tag = nil
				goto continue
			end
			in_tag = nil

			-- wind back to the START_OPENING_TAG of the tag that just ended;
			-- the token right after it holds the tag name
			local curr_token = #TOKENS
			while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do
				curr_token = curr_token - 1
			end
			curr_token = curr_token + 1

			if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" then
				error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
			end

			local tagname = TOKENS[curr_token].value

			if RAW_TEXT_TAGS[tagname] then
				-- position of the last character before "</tagname"
				local end_tag = (content:find("</"..tagname, i, true) or 0) - 1
				if end_tag < 1 then
					logger.printerr("Can't find closing " .. tagname .. "!")
					print(content:sub(i))
					os.exit(-5)
				end
				local text_content = content:sub(i+1, end_tag)

				-- special handling of pre
				if tagname == "pre" then
					-- check if it "looks" like HTML
					if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
						-- tokenise the inner text
						local text_tokens = M.tokenise( text_content )

						-- and add it to the current token list
						for _, tok in ipairs(text_tokens) do
							table.insert( TOKENS, tok )
						end

						i = end_tag
						goto continue
					end
				end
				-- treat the rest as text

				i = end_tag
				table.insert( TOKENS, {type="TEXT", value=text_content} )
				goto continue
			end


			goto continue
		end


		if in_tag then
			-- unquoted whitespace ends the current word
			if currently_opened_quotes == nil and char:match("%s") then
				if #text_memory ~= 0 then
					local word = trim(text_memory)

					if not word:match("^%s*$") then
						table.insert( TOKENS, {type="WORD", value=word})
						text_memory = ""
					end

					goto continue
				end
			end

			text_memory = text_memory .. char
			goto continue
		else
			text_memory = text_memory .. char
			goto continue
		end


		::continue::
		i = i+1
	end

	-- Text that follows the last tag (or input without any tag at all) would
	-- otherwise be dropped. Leftovers of an unterminated tag are not text.
	if not in_tag and #text_memory ~= 0 then
		table.insert( TOKENS, {type="TEXT", value=text_memory} )
	end

	return TOKENS
end


--- Test whether an element matches a simple (non-combinator) selector.
--
-- Every constraint present in `selector` must hold:
--   selector.tag_name            exact tag name
--   selector.id                  exact `id` attribute
--   selector.class               list of class names that must all be present
--   selector.attributes_values   map of attribute name -> required value
--   selector.attributes_present  list of attribute names that must be set
-- Missing selector fields are treated as "no constraint". Text nodes never match.
--
-- @param element   DOM node
-- @param selector  selector description table
-- @return true when the element matches, false otherwise
function M.check_simple_selector(element, selector)
	-- Skip text nodes
	if element.tag_name == ":text" then
		return false
	end

	-- Check tag name if specified
	if selector.tag_name and element.tag_name ~= selector.tag_name then
		return false
	end

	-- Check ID if specified
	if selector.id and element.attributes.id ~= selector.id then
		return false
	end

	-- Check classes if specified
	if selector.class and #selector.class > 0 then
		local element_classes = element.attributes.class
		-- The parser stores a value-less `class` attribute as `true` rather
		-- than a list; such an element has no class names to match against.
		if type(element_classes) ~= "table" then
			return false
		end

		for _, class in ipairs(selector.class) do
			local found = false
			for _, elem_class in ipairs(element_classes) do
				if elem_class == class then
					found = true
					break
				end
			end
			if not found then
				return false
			end
		end
	end

	-- Check attribute value selectors
	for attr_name, attr_value in pairs(selector.attributes_values or {}) do
		local elem_attr_value = element.attributes[attr_name]
		if elem_attr_value ~= attr_value then
			return false
		end
	end

	-- Check attribute presence selectors
	for _, attr_name in ipairs(selector.attributes_present or {}) do
		if not element.attributes[attr_name] then
			return false
		end
	end

	return true
end

--- Collect every node of a tree that matches a simple selector.
-- The tree is walked depth-first, parents before children, so the result is
-- in document order. The starting node itself is tested too.
-- @param document  node to start from
-- @param selector  selector table as understood by M.check_simple_selector
-- @return list of matching nodes
function M.query_simple_selector(document, selector)
	local found = {}
	local pending = { document }

	while #pending > 0 do
		local node = table.remove(pending)

		if M.check_simple_selector(node, selector) then
			found[#found + 1] = node
		end

		-- push children last-to-first so the first child is popped next
		local kids = node.children
		for idx = #kids, 1, -1 do
			pending[#pending + 1] = kids[idx]
		end
	end

	return found
end


--- Build a DOM tree from the token list produced by M.tokenise.
--
-- Returns a root node (tag_name nil) whose descendants are the parsed
-- elements and ":text" nodes. Attributes are stored on each element:
--   name="value" / name='value' / name=value  -> trimmed string value
--   class=...                                  -> list of class names
--   name (no usable value, including name="")  -> true
-- Text nodes are cleaned with M.clean_text_nodes before returning.
--
-- @param TOKENS  list of tokens
-- @return the document root node
function M.parse_tokens_into_document( TOKENS )
	local DOCUMENT = M.make_dom_element(nil, nil)
	local current_doc_element = DOCUMENT
	-- name of the tag whose opening tag is being read, nil otherwise
	local in_opening_tag_for = nil

	local i = 1
	while i <= #TOKENS do
		local token = TOKENS[i]

		if token.type == "WORD" then
			-- text nodes cannot have children: step back to their parent
			if current_doc_element.tag_name == ":text" then
				current_doc_element = current_doc_element.parent
			end


			-- tag name of an opening tag (i > 1: there is no token before the first one)
			if i > 1 and TOKENS[i-1].type == "START_OPENING_TAG" then
				local new_elem = M.make_dom_element( token.value, current_doc_element )
				current_doc_element = new_elem
				in_opening_tag_for = token.value

				goto continue
			end

			-- tag name of a closing tag
			if i > 1 and TOKENS[i-1].type == "START_CLOSING_TAG" then
				local curr_elem = current_doc_element

				-- If we find a closing tag, check if:
				-- - That tag is a void tag (childless, auto-closing)
				-- - The last child added to the current element is that tag
				--
				-- This avoids having <img> tags as parents of <p> tags for example
				local last_child = curr_elem.children[#curr_elem.children]
				if last_child and VOID_TAGS[last_child.tag_name] and last_child.tag_name == token.value then
					goto continue
				end

				-- climb until the element being closed is found
				while curr_elem.parent and curr_elem.tag_name ~= token.value do
					curr_elem = curr_elem.parent
				end

				if curr_elem.parent == nil then
					-- reached DOCUMENT root
					logger.printerr("Warning: reached document root while trying to match for closing " .. token.value .. " token.")
					current_doc_element = DOCUMENT
				else
					current_doc_element = curr_elem.parent
				end


				goto continue
			end


			-- any other word inside an opening tag is an attribute
			if in_opening_tag_for then
				-- quoted value: name="value" or name='value'
				local name, raw_value = token.value:match("([^=]+)=['\"](.+)['\"]")

				if name == nil then
					-- unquoted value: name=value
					name, raw_value = token.value:match("^([^=]+)=([^'\"].*)$")
				end

				if name == nil or raw_value == nil then
					-- no usable value: treat as a boolean attribute
					name = token.value:match("([%w-]+)")

					if name == nil then
						error("Unrecognised word: " .. tostring(name) .. " (Token ".. tostring(i) .." , type=" .. tostring(token.type) .. ", value=" .. tostring(token.value) .. ")")
					end

					current_doc_element.attributes[name] = true

					goto continue
				end


				local value = trim(raw_value)

				if name == "class" then
					local classes = {}

					for class in value:gmatch("%S+") do
						table.insert( classes, class )
					end

					value = classes
				end

				current_doc_element.attributes[name] = value

				goto continue
			end

		end


		if token.type == "END_TAG" then
			if in_opening_tag_for then
				-- void elements take no children: close them right away
				if VOID_TAGS[in_opening_tag_for] then
					if current_doc_element.parent == nil then
						-- reached DOCUMENT root
						current_doc_element = DOCUMENT
					else
						current_doc_element = current_doc_element.parent
					end
				end

			end

			in_opening_tag_for = nil

			goto continue
		end


		if token.type == "TEXT" then
			local new_elem = M.make_dom_element( ":text", current_doc_element )
			new_elem.content = token.value
			current_doc_element = new_elem

			goto continue
		end


		::continue::
		i = i+1
	end

	M.clean_text_nodes( DOCUMENT )

	return DOCUMENT
end


--- Tidy the text nodes of a subtree, in place.
--
-- Whitespace-only text nodes are removed from their parent; in the remaining
-- ones every run of whitespace becomes a single space. Subtrees rooted at a
-- RAW_TEXT_TAGS element are left untouched.
--
-- @param node  root of the subtree to clean
function M.clean_text_nodes(node)
	if node.tag_name == ":text" then
		if #trim(node.content) ~= 0 then
			node.content = node.content:gsub("%s+", " ")
			return
		end

		-- purge content-less text nodes
		local parent = node.parent
		if not parent then
			error("Text node without a parent; should be impossible !")
		end

		for idx, sibling in ipairs(parent.children) do
			if sibling == node then
				table.remove( parent.children, idx )
				break
			end
		end

		return
	end

	-- Don't clean anything in raw text tags
	if RAW_TEXT_TAGS[node.tag_name] then
		return
	end

	-- iterate over a copy: cleaning a child may remove it from node.children
	for _, child in ipairs( shallow_copy(node.children) ) do
		M.clean_text_nodes( child )
	end
end


--- Serialise a node and its descendants to an indented HTML-like string.
--
-- Pseudo elements (tag names starting with ":", i.e. text nodes and the
-- nameless root, shown as ":root") only get tags of their own when
-- `include_internal_pseudoelements` is set; otherwise just their content /
-- children are emitted.
--
-- @param node    node to serialise
-- @param indent  indentation level, two spaces per level (default 0)
-- @param include_internal_pseudoelements  also print <:text> / <:root> tags (default false)
-- @return the serialised string (element output starts with a newline)
function M._tostring(node, indent, include_internal_pseudoelements)
	indent = indent or 0
	include_internal_pseudoelements = include_internal_pseudoelements or false

	local label = node.tag_name or ":root"
	local is_pseudo_element = label:sub(1,1) == ":"
	local pad = string.rep("  ", indent)

	-- Text nodes: content only, optionally wrapped in <:text> tags
	if node.tag_name == ":text" then
		if include_internal_pseudoelements then
			return "<:text>" .. node.content .. "</:text>"
		end

		return "" .. node.content
	end

	-- whether this node prints its own opening/closing tags
	local show_tag = not is_pseudo_element or include_internal_pseudoelements

	local parts = {}

	if show_tag then
		parts[#parts + 1] = "\n" .. pad .. "<" .. label
	end

	-- Attributes; list values (e.g. class) are joined with spaces
	for attr, value in pairs(node.attributes) do
		local shown
		if type(value) == "table" then
			local items = {}
			for idx, item in ipairs(value) do
				items[idx] = tostring(item)
			end
			shown = table.concat(items, " ")
		else
			shown = tostring(value)
		end

		parts[#parts + 1] = " " .. attr .. "=\"" .. shown .. "\""
	end

	if show_tag then
		parts[#parts + 1] = ">"
	end

	-- Children of a hidden pseudo element stay on the current level
	local child_indent = show_tag and indent + 1 or indent

	for _, child in ipairs(node.children) do
		parts[#parts + 1] = M._tostring(child, child_indent, include_internal_pseudoelements)
	end

	local rendered = table.concat(parts)

	if show_tag and not VOID_TAGS[node.tag_name] then
		-- Put the closing tag on its own line when the output so far ends
		-- with a tag other than a closing </:text>
		local text_close = "</:text>"
		if rendered:sub(-1) == ">" and rendered:sub(-#text_close) ~= text_close then
			rendered = rendered .. "\n" .. pad
		end

		rendered = rendered .. "</" .. label .. ">"
	end

	return rendered
end

--- Serialise a node like M._tostring, with surrounding whitespace trimmed.
-- @param node         node to serialise
-- @param base_indent  starting indentation level (passed through to M._tostring)
-- @param include_internal_pseudoelements  passed through to M._tostring
-- @return the serialised string
function M.tostring(node, base_indent, include_internal_pseudoelements)
	local dump = M._tostring(node, base_indent, include_internal_pseudoelements)
	return trim(dump)
end


--- Parse an HTML string into a DOM tree.
-- Runs the three stages in order: preprocess, tokenise, build the document.
-- @param html_string  the HTML source
-- @return the document root node
function M.parse( html_string )
	-- extra parentheses keep exactly one value from preprocess
	local tokens = M.tokenise( (M.preprocess( html_string )) )
	local dom = M.parse_tokens_into_document( tokens )

	return dom
end

return M