feat: html parsing

Also, initial commit :)
2025-01-14 19:48:32 +01:00 · 2025-01-14 19:48:32 +01:00 · 111da7d663
commit 111da7d663
3 changed files with 641 additions and 0 deletions
--- a/html.lua
+++ b/html.lua
@ -0,0 +1,469 @@
+
+--- Strip leading and trailing whitespace from a string.
+-- @param str the string to trim
+-- @return the string without its surrounding whitespace
+local function trim(str)
+	-- the lazy capture leaves the trailing whitespace to the final %s*
+	local stripped = str:match("^%s*(.-)%s*$")
+	return stripped
+end
+
+
+
+-- Module table; the public functions below are attached to it and it is
+-- returned at the end of the file.
+local M = {}
+
+-- Tags whose content the tokeniser captures verbatim instead of parsing
+-- it as markup.
+local RAW_TEXT_TAGS = {
+	["script"] = true,
+	["style"] = true,
+	["pre"] = true,
+}
+
+-- Void tags are the content-less (so-called "self-closing") tags: they
+-- never get a closing tag. Stored as a set keyed by tag name.
+local VOID_TAGS = {}
+for _, void_tag_name in ipairs({
+	"area",
+	"base",
+	"br",
+	"col",
+	"embed",
+	"hr",
+	"img",
+	"input",
+	"link",
+	"meta",
+	"param", -- deprecated
+	"source",
+	"track",
+	"wbr",
+}) do
+	VOID_TAGS[void_tag_name] = true
+end
+
+
+--- Create a DOM element and, when a parent is given, append it to that
+--- parent's list of children.
+--
+-- The element is a table with the fields `tag_name`, `parent`, `children`,
+-- `attributes` and `content`. Its metatable redirects any other key to the
+-- `attributes` table: reading an unknown key returns the attribute of that
+-- name, and assigning an unknown key only succeeds when an attribute of
+-- that name already exists.
+--
+-- @param tag_name    name of the tag (may be nil)
+-- @param parent_elem element to attach the new element to (may be nil)
+-- @return the new element
+function M.make_dom_element( tag_name, parent_elem )
+	local element = {
+		tag_name = tag_name,
+		parent = parent_elem,
+		children = {},
+		attributes = {},
+		content = "",
+	}
+
+	if parent_elem then
+		local siblings = parent_elem.children
+		siblings[#siblings + 1] = element
+	end
+
+	return setmetatable(element, {
+		-- Keys missing from the element itself are looked up as attributes
+		__index = function(self, key)
+			return rawget(self.attributes, key)
+		end,
+
+		-- Existing attributes may be overwritten; new keys are rejected
+		__newindex = function(self, key, value)
+			if rawget(self.attributes, key) == nil then
+				error("Cannot add new attribute to DOM element: " .. tostring(key))
+			end
+
+			rawset(self.attributes, key, value)
+		end,
+	})
+end
+
+
+
+--- Normalise raw HTML text before it is tokenised.
+--
+-- Both substitutions are purely textual: they are applied everywhere in
+-- the input, including text, attribute values and script/style content.
+--
+-- @param content the HTML source as a string
+-- @return the normalised HTML string
+function M.preprocess( content )
+	-- remove "self closing" slashes as they MUST be ignored (spec)
+	-- and would cause problems
+	content = content:gsub("/%s*>", ">")
+	-- remove whitespace at the start of "</ closing>" tags, so that a
+	-- closing tag can be found by searching for the literal "</name".
+	-- (The previous pattern "</%s*/%s*" required a second slash and
+	-- therefore never matched an ordinary closing tag.)
+	content = content:gsub("</%s+", "</")
+
+	-- `content` holds only the first gsub result, so the substitution
+	-- count is not leaked to the caller
+	return content
+end
+
+
+--- Split an HTML string into a flat list of tokens.
+--
+-- Token types produced (each token is a table with a `type` field):
+--   * START_OPENING_TAG -- a "<" starting an opening tag
+--   * START_CLOSING_TAG -- a "</" starting a closing tag
+--   * WORD    -- a whitespace-separated word inside a tag, in `value`
+--                (the tag name, or one attribute such as `class="a b"`)
+--   * END_TAG -- the ">" ending a tag
+--   * TEXT    -- text found between tags, or the verbatim content of a
+--                raw text tag (see RAW_TEXT_TAGS), in `value`
+--
+-- Comments ("<!-- ... -->") and declarations ("<!...>") produce no token.
+-- Inside a quoted attribute value every character is literal, including
+-- ">", "<", whitespace and the other kind of quote.
+-- A tag that is still open when the input ends is dropped.
+--
+-- @param content the HTML source as a string
+-- @return the list of tokens
+function M.tokenise( content )
+	local TOKENS = {}
+
+	-- state
+	local in_tag = false
+	local currently_opened_quotes = nil
+	local text_memory = ""
+	-- name of the raw text tag whose opening tag is being read, if any
+	local pending_raw_text_tag = nil
+	local content_length = #content
+
+	local function push( token )
+		TOKENS[#TOKENS + 1] = token
+	end
+
+	-- Emit the accumulated text (if any) as a TEXT token.
+	local function flush_text()
+		if #text_memory ~= 0 then
+			push({type="TEXT", value=text_memory})
+			text_memory = ""
+		end
+	end
+
+	-- Emit the accumulated in-tag word (if any) as a WORD token, and
+	-- remember it when it is the name of a raw text tag being opened.
+	local function flush_word()
+		local word = trim(text_memory)
+		text_memory = ""
+
+		if #word == 0 then
+			return
+		end
+
+		local previous = TOKENS[#TOKENS]
+		if previous and previous.type == "START_OPENING_TAG" and RAW_TEXT_TAGS[word] then
+			print("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
+			pending_raw_text_tag = word
+		end
+
+		push({type="WORD", value=word})
+	end
+
+
+	local i = 1
+
+	while i <= content_length do
+		local char = content:sub(i,i)
+
+		-- Inside a quoted attribute value: only the matching quote is special
+		if in_tag and currently_opened_quotes ~= nil then
+			if char == currently_opened_quotes then
+				currently_opened_quotes = nil
+			end
+
+			text_memory = text_memory .. char
+			goto continue
+		end
+
+		if char == "<" then
+			if content:sub(i, i+3) == "<!--" then
+				-- Skip the whole comment. Searching from i+2 lets the
+				-- degenerate forms "<!-->" and "<!--->" end immediately.
+				-- An unterminated comment swallows the rest of the input.
+				local _, comment_end = content:find("-->", i + 2, true)
+				i = comment_end or content_length
+				goto continue
+			end
+
+			if content:sub(i, i+1) == "<!" then
+				-- Skip declarations such as "<!DOCTYPE html>"
+				i = content:find(">", i, true) or content_length
+				goto continue
+			end
+
+			---------------------------------
+			flush_text()
+
+			in_tag = true
+			pending_raw_text_tag = nil
+
+			-- closing tag
+			if content:sub(i, i+1) == "</" then
+				push({type="START_CLOSING_TAG"})
+				i = i+1
+				goto continue
+			end
+
+			push({type="START_OPENING_TAG"})
+			goto continue
+		end
+
+		-- A ">" outside of a tag is ordinary text and falls through below
+		if char == ">" and in_tag then
+			flush_word()
+
+			in_tag = false
+			push({type = "END_TAG"})
+
+			if pending_raw_text_tag then
+				-- Capture everything up to the closing tag verbatim. The
+				-- closing tag is matched literally and case-sensitively,
+				-- so "</ style" or "</STYLE" will not end the raw text.
+				-- Without a closing tag the rest of the input is raw text.
+				local close_start = content:find("</" .. pending_raw_text_tag, i + 1, true)
+				local raw_end = close_start and (close_start - 1) or content_length
+
+				push({type="TEXT", value=content:sub(i + 1, raw_end)})
+
+				pending_raw_text_tag = nil
+				-- resume on the "<" of the closing tag
+				i = raw_end
+			end
+
+			goto continue
+		end
+
+
+		if in_tag then
+			-- Unquoted whitespace separates words and is never stored
+			if char:match("%s") then
+				flush_word()
+				goto continue
+			end
+
+			if char == "'" or char == '"' then
+				currently_opened_quotes = char
+			end
+
+			text_memory = text_memory .. char
+			goto continue
+		end
+
+		text_memory = text_memory .. char
+
+		::continue::
+		i = i+1
+	end
+
+	-- Text after the last tag would otherwise be lost
+	if not in_tag then
+		flush_text()
+	end
+
+	return TOKENS
+end
+
+
+--- Split one in-tag word into an attribute name and value.
+-- @param word a WORD token value such as `class="a b"`, `width=100` or `disabled`
+-- @return name, value -- value is `true` for a bare attribute and nil for an
+--         empty one; name is nil when the word contains no attribute name
+local function parse_attribute( word )
+	local name_pattern = "([%w_:.%-]+)"
+
+	-- quoted value: the closing quote must be the same as the opening one
+	local name, _, raw_value = word:match("^" .. name_pattern .. "=([\"'])(.-)%2$")
+
+	if name == nil then
+		-- unquoted value
+		name, raw_value = word:match("^" .. name_pattern .. "=(.*)$")
+	end
+
+	if name == nil then
+		-- bare attribute, e.g. `disabled`
+		name = word:match(name_pattern)
+
+		if name == nil then
+			return nil
+		end
+
+		return name, true
+	end
+
+	if raw_value == "" then
+		return name, nil
+	end
+
+	return name, trim(raw_value)
+end
+
+--- Build a DOM tree out of the tokens produced by M.tokenise.
+--
+-- The returned root element has no tag name. Text becomes "#text"
+-- elements whose `content` holds the text. A closing tag closes the
+-- nearest open element of that name (and everything opened inside it);
+-- a closing tag that matches no open element only prints a warning.
+-- Elements listed in VOID_TAGS are closed as soon as their tag ends.
+--
+-- @param TOKENS list of tokens
+-- @return the root element of the document
+function M.parse_tokens_into_document( TOKENS )
+	local DOCUMENT = M.make_dom_element(nil, nil)
+	local current_doc_element = DOCUMENT
+	local in_opening_tag_for = nil
+
+	for i, token in ipairs(TOKENS) do
+		-- nil for the first token
+		local previous = TOKENS[i - 1]
+		local previous_type = previous and previous.type
+
+		if token.type == "WORD" then
+			if previous_type == "START_OPENING_TAG" then
+				current_doc_element = M.make_dom_element( token.value, current_doc_element )
+				in_opening_tag_for = token.value
+
+			elseif previous_type == "START_CLOSING_TAG" then
+				local curr_elem = current_doc_element
+
+				while curr_elem.parent and curr_elem.tag_name ~= token.value do
+					curr_elem = curr_elem.parent
+				end
+
+				if curr_elem.parent == nil then
+					-- reached DOCUMENT root: nothing to close, so the
+					-- current position in the tree is kept
+					print("Warning: reached document root while trying to match for closing " .. token.value .. " token.")
+				else
+					current_doc_element = curr_elem.parent
+				end
+
+				in_opening_tag_for = nil
+
+			elseif in_opening_tag_for then
+				local name, value = parse_attribute( token.value )
+
+				if name == nil then
+					error("Unrecognised word: " .. tostring(token.value))
+				end
+
+				current_doc_element.attributes[name] = value
+			end
+
+		elseif token.type == "END_TAG" then
+			if in_opening_tag_for and VOID_TAGS[in_opening_tag_for] then
+				-- void elements have no content: step back out at once
+				current_doc_element = current_doc_element.parent or DOCUMENT
+			end
+
+			in_opening_tag_for = nil
+
+		elseif token.type == "TEXT" then
+			-- text nodes are leaves: the current element does not change
+			local text_node = M.make_dom_element( "#text", current_doc_element )
+			text_node.content = token.value
+		end
+	end
+
+	return DOCUMENT
+end
+
+
+--- Normalise the text nodes of a tree, in place.
+--
+-- Text nodes made only of whitespace are removed from their parent; in
+-- the others, runs of whitespace are collapsed to a single space and the
+-- ends are trimmed. The content of raw text tags (see RAW_TEXT_TAGS) is
+-- left untouched.
+--
+-- @param node the element to clean, together with its descendants
+-- @return the same `node`
+function M.clean_text_nodes(node)
+	if node.tag_name ~= "#text" then
+		-- Don't clean anything in raw text tags
+		if not RAW_TEXT_TAGS[node.tag_name] then
+			local children = node.children
+
+			-- Walk backwards: a child may remove itself from this list,
+			-- which would make a forward walk skip its next sibling
+			for i = #children, 1, -1 do
+				M.clean_text_nodes( children[i] )
+			end
+		end
+
+		return node
+	end
+
+	-- the parentheses drop the substitution count returned by gsub
+	local collapsed = trim( (node.content:gsub("%s+", " ")) )
+
+	-- purge content-less text nodes
+	if #collapsed == 0 then
+		if not node.parent then
+			error("Text node without a parent; should be impossible !")
+		end
+
+		local siblings = node.parent.children
+		for i = #siblings, 1, -1 do
+			if siblings[i] == node then
+				table.remove( siblings, i )
+				break
+			end
+		end
+
+		return node
+	end
+
+	node.content = collapsed
+	return node
+end
+
+
+--- Print an element and its descendants to standard output as an
+--- indented, HTML-like tree.
+--
+-- Attributes are printed sorted by name so that the output is the same
+-- on every run (the iteration order of `pairs` is unspecified).
+--
+-- @param node   the element to print
+-- @param indent nesting level of `node`; defaults to 0 (root level)
+function M.print_document(node, indent)
+	if node == nil then
+		error("print_document: expected a DOM element, got nil", 2)
+	end
+
+	-- Default indentation is 0 (root level)
+	indent = indent or 0
+
+	-- Two spaces per nesting level
+	local indent_str = string.rep("  ", indent)
+
+	if node.tag_name == "#text" then
+		print(indent_str .. "<#text>\n" .. node.content .. "\n" .. indent_str .. "</#text>")
+		return
+	end
+
+	-- The document root has no tag name
+	local tag_name = node.tag_name or "#root"
+
+	local attr_names = {}
+	for attr in pairs(node.attributes) do
+		attr_names[#attr_names + 1] = attr
+	end
+	table.sort(attr_names)
+
+	-- Opening tag, with its attributes if any
+	local parts = { indent_str, "<", tag_name }
+	for _, attr in ipairs(attr_names) do
+		parts[#parts + 1] = " " .. attr .. "=\"" .. tostring(node.attributes[attr]) .. "\""
+	end
+	parts[#parts + 1] = ">"
+
+	print( table.concat(parts) )
+
+	-- Recursively print children
+	for _, child in ipairs(node.children) do
+		M.print_document(child, indent + 1)
+	end
+
+	-- Print the closing tag
+	print(indent_str .. "</" .. tag_name .. ">")
+end
+
+
+--- Parse an HTML string into a DOM tree.
+--
+-- Runs the whole pipeline: preprocess, tokenise, build the tree, then
+-- clean its text nodes.
+--
+-- @param html_string the HTML source as a string
+-- @return the root element of the parsed document
+function M.parse( html_string )
+	local clean_html = M.preprocess( html_string )
+
+	local tokens = M.tokenise( clean_html )
+
+	local document = M.parse_tokens_into_document( tokens )
+
+	-- The tree is cleaned in place, so the document itself is returned
+	-- rather than the result of the cleaning call.
+	M.clean_text_nodes( document )
+
+	return document
+end
+
+return M
--- a/main.lua
+++ b/main.lua
@ -0,0 +1,14 @@
+#!/usr/bin/env lua
+
+-- Parses test.html from the current directory and prints the resulting
+-- document tree.
+
+local html = require("html")
+
+
+-- io.open returns nil plus a message describing why the open failed
+local file, open_err = io.open("test.html", "r")
+
+if file == nil then
+	error("File doesn't exist: " .. tostring(open_err))
+end
+
+local content = file:read("a")
+file:close()
+
+html.print_document( html.parse( content ) )
--- a/test.html
+++ b/test.html
@ -0,0 +1,158 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Test Page for htmlq</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            background-color: #f4f4f4;
+            margin: 0;
+            padding: 20px;
+        }
+        .container {
+            max-width: 800px;
+            margin: 0 auto;
+            background-color: #fff;
+            padding: 20px;
+            border-radius: 8px;
+            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+        }
+        .header {
+            text-align: center;
+            margin-bottom: 20px;
+        }
+        .header h1 {
+            color: #333;
+        }
+        .content {
+            display: flex;
+            justify-content: space-between;
+        }
+        .sidebar {
+            width: 30%;
+            background-color: #eaeaea;
+            padding: 15px;
+            border-radius: 8px;
+        }
+        .sidebar ul {
+            list-style-type: none;
+            padding: 0;
+        }
+        .sidebar ul li {
+            margin-bottom: 10px;
+        }
+        .sidebar ul li a {
+            text-decoration: none;
+            color: #333;
+        }
+        .main-content {
+            width: 65%;
+        }
+        .main-content .article {
+            margin-bottom: 20px;
+            padding: 15px;
+            background-color: #f9f9f9;
+            border-radius: 8px;
+        }
+        .main-content .article h2 {
+            color: #555;
+        }
+        .main-content .article p {
+            color: #666;
+        }
+        .footer {
+            text-align: center;
+            margin-top: 20px;
+            padding-top: 20px;
+            border-top: 1px solid #ddd;
+        }
+        .footer p {
+            color: #777;
+        }
+        #special-note {
+            background-color: #ffeb3b;
+            padding: 10px;
+            border-radius: 8px;
+            margin-top: 20px;
+        }
+        .duplicate-class {
+            background-color: #d1c4e9;
+            padding: 10px;
+            border-radius: 8px;
+            margin: 10px 0;
+        }
+        .duplicate-class > p {
+            color: #512da8;
+        }
+        .duplicate-class + .duplicate-class {
+            margin-top: 20px;
+        }
+        .duplicate-class ~ .duplicate-class {
+            border: 2px solid #673ab7;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <div class="header">
+            <h1>Welcome to the Test Page</h1>
+        </div>
+        <div class="content">
+            <div class="sidebar">
+                <ul>
+                    <li><a href="#section1">Section 1</a></li>
+                    <li><a href="#section2">Section 2</a></li>
+                    <li><a href="#section3">Section 3</a></li>
+                </ul>
+            </div>
+            <div class="main-content">
+                <div class="article" id="section1">
+                    <h2>Section 1: Introduction</h2>
+                    <p>This is the introduction section. It provides an overview of the content.</p>
+                    <div class="duplicate-class">
+                        <p>This is a duplicate class element inside Section 1.</p>
+                    </div>
+                </div>
+                <div class="article" id="section2">
+                    <h2>Section 2: Main Content</h2>
+                    <p>This is the main content section. It contains the bulk of the information.</p>
+                    <div class="nested">
+                        <p>This is a nested paragraph inside the main content.</p>
+                        <div class="deeply-nested">
+                            <p>This is a deeply nested paragraph.</p>
+                        </div>
+                    </div>
+                    <div class="duplicate-class">
+                        <p>This is another duplicate class element inside Section 2.</p>
+                    </div>
+                    <div class="duplicate-class">
+                        <p>This is yet another duplicate class element inside Section 2.</p>
+                    </div>
+                </div>
+                <div class="article" id="section3">
+                    <h2>Section 3: Conclusion</h2>
+                    <p>This is the conclusion section. It summarizes the content.</p>
+                    <div class="duplicate-class">
+                        <p>This is a duplicate class element inside Section 3.</p>
+                    </div>
+                </div>
+            </div>
+        </div>
+        <div id="special-note">
+            <p>This is a special note with a unique ID.</p>
+        </div>
+        <div class="footer">
+            <p>&copy; 2023 Test Page. All rights reserved.</p>
+        </div>
+    </div>
+
+    <script>
+        document.addEventListener('DOMContentLoaded', function() {
+            console.log('Document is fully loaded and ready to be processed.');
+        });
+    </script>
+</body>
+</html>
+