fix: spaces around '=' sign in html tag attributes

2025-04-29 20:14:49 +02:00 · 2025-04-29 20:14:49 +02:00 · 83b1ac0c83
parent 5e8c023559
commit 83b1ac0c83
1 changed files with 128 additions and 237 deletions
--- a/html.lua
+++ b/html.lua
@ -293,44 +293,15 @@ function M.tokenise( content )
 	local in_tag = nil
 	local currently_opened_quotes = nil
 	local text_memory = ""
-
+	local attr_name = nil  -- Track attribute name when we encounter whitespace before equals
 	local i = 1

 	while i <= #content do
-		local char = content:sub(i,i)
-
-
-		--
-		-- Taking care of quotes
-		--
-		if in_tag then
-			-- finding matching quotes
-			if currently_opened_quotes ~= nil and char == currently_opened_quotes then
-				currently_opened_quotes = nil
-				text_memory = text_memory .. char
-				goto continue
-			end
-
-			-- Opening a new set of quotes
-			if currently_opened_quotes == nil and (char == "'" or char == '"') then
-				currently_opened_quotes = char
-				text_memory = text_memory .. char
-				goto continue
-			end
-
-			-- reaching here means:
-			-- - we're in a tag, inside quotes
-			-- - the character is not the closing quote mark
-			-- So just add it and get on with it.
-			if currently_opened_quotes ~= nil then
-				text_memory = text_memory .. char
-				goto continue
-			end
-		end
-
-
+		local char = content:sub(i, i)

+		-- Handle "<": comments, "<!" declarations, and the start of opening/closing tags
 		if char == "<" then
+			-- Handle comments
 			if content:sub(i, i+3) == "<!--" then
 				local end_i = content:find("-->", i+3, true)
 				if end_i then
@ -338,66 +309,64 @@ function M.tokenise( content )
 				else
 					i = #content
 				end
-
 				goto continue
 			end

+			-- Skip "<!...>" declarations such as doctype (NOTE: assumes a closing ">" exists; find returns nil otherwise)
 			if content:sub(i, i+1) == "<!" then
 				i = content:find(">", i, true)
 				goto continue
 			end

-			---------------------------------
+			-- Save any accumulated text before starting a new tag
 			if #text_memory ~= 0 then
-				table.insert( TOKENS, {type="TEXT", value=text_memory} )
+				table.insert(TOKENS, {type="TEXT", value=text_memory})
 				text_memory = ""
 			end

-			-- closing tag
+			-- Reset attribute tracking
+			attr_name = nil
+
+			-- Handle closing tags
 			if content:sub(i, i+1) == "</" then
-				table.insert( TOKENS, {type="START_CLOSING_TAG"} )
+				table.insert(TOKENS, {type="START_CLOSING_TAG"})
 				in_tag = "closing"
 				i = i+1
 				goto continue
 			end

-			table.insert( TOKENS, {type="START_OPENING_TAG"} )
+			-- Handle opening tags
+			table.insert(TOKENS, {type="START_OPENING_TAG"})
 			in_tag = "opening"
 			goto continue
 		end

-
-		if char == ">" and in_tag then
-			-- first, cleanup the text_memory, as the closing > is often side-by-side with the last "word"
+		-- Handle end of tag
+		if char == ">" and in_tag and currently_opened_quotes == nil then
+			-- Process any remaining text in the tag
 			if #text_memory ~= 0 then
 				local word = trim(text_memory)
 				if not word:match("^%s*$") then
-					table.insert( TOKENS, {type="WORD", value=word})
+					table.insert(TOKENS, {type="WORD", value=word})
 				end
 				text_memory = ""
 			end

-			table.insert( TOKENS, {type = "END_TAG"} )
+			-- Reset attribute tracking
+			attr_name = nil

-			-- closing tags don't require any more work.
-			if in_tag == "closing" then
-				in_tag = nil
-				goto continue
-			end
-			in_tag = nil
+			table.insert(TOKENS, {type="END_TAG"})

+			-- Handle special tags with raw content
+			if in_tag == "opening" then
 				local curr_token = #TOKENS
 				while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do
 					curr_token = curr_token - 1
 				end
 				curr_token = curr_token + 1

-			if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" then
-				error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
-			end
-
+				if curr_token <= #TOKENS and TOKENS[curr_token].type == "WORD" then
 					local tagname = TOKENS[curr_token].value
-
 					if RAW_TEXT_TAGS[tagname] then
 						local end_tag = (content:find("</"..tagname, i, true) or 0) - 1
 						if end_tag < 1 then
@ -405,199 +374,121 @@ function M.tokenise( content )
 							print(content:sub(i))
 							os.exit(-5)
 						end
+
 						local text_content = content:sub(i+1, end_tag)
-
-				-- special handling of pre
+						-- Special handling of pre
 						if tagname == "pre" then
-					-- check if it "looks" like HTML
+							-- Check if it "looks" like HTML
 							if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
-						-- tokenise the inner text
-						local text_tokens = M.tokenise( text_content )
-
-						-- and add it to the current token list
+								-- Tokenise the inner text
+								local text_tokens = M.tokenise(text_content)
+								-- Add it to the current token list
 								for _, tok in ipairs(text_tokens) do
-							table.insert( TOKENS, tok )
+									table.insert(TOKENS, tok)
 								end
-
 								i = end_tag
 								goto continue
 							end
 						end
-				-- treat the rest as text
-
+						-- Treat the rest as text
 						i = end_tag
-				table.insert( TOKENS, {type="TEXT", value=text_content} )
+						table.insert(TOKENS, {type="TEXT", value=text_content})
+						goto continue
+					end
+				end
+			end
+
+			in_tag = nil
 			goto continue
 		end

-
-
-
-			goto continue
-		end
-
-
-
-
-
-
-		----------------------------------------------------
-		---  "OLD", UNCHECKED CODE
-
-
-
-
-
-
-		-- if char == ">" and in_tag and currently_opened_quotes == nil then
-		-- 	if #text_memory ~= 0 then
-		-- 		local word = trim(text_memory)
-		-- 		if not word:match("^%s*$") then
-		-- 			table.insert( TOKENS, {type="WORD", value=word})
-		-- 		end
-		-- 		text_memory = ""
-		-- 	end
-
-
-		-- 	table.insert( TOKENS, {type = "END_TAG"} )
-
-
-		-- 	local curr_token = #TOKENS
-		-- 	while curr_token > 0 and (TOKENS[curr_token].type ~= "START_OPENING_TAG" or TOKENS[curr_token].type ~= "START_CLOSING_TAG") do
-		-- 		curr_token = curr_token - 1
-		-- 	end
-		-- 	curr_token = curr_token + 1
-		-- 	if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" and TOKENS[curr_token].type ~= "START_CLOSING_TAG" then
-		-- 		error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
-		-- 	end
-
-		-- 	if TOKENS[curr_token].type == "START_CLOSING_TAG" then
-		-- 		goto continue
-		-- 	end
-
-
-		-- 	local tagname = TOKENS[curr_token+1].value
-
-		-- 	if RAW_TEXT_TAGS[tagname] then
-		-- 		logger.printerr("Warning: "..tagname.." tags may contain text that would be incorrectly parsed as HTML.")
-
-		-- 		print(content:sub(1,i-1))
-		-- 		print(("="):rep(40))
-		-- 		print(content:sub(i))
-
-		-- 		local end_tag = content:find("</"..tagname, i, true) - 1
-		-- 		local text_content = content:sub(i+1, end_tag)
-
-		-- 		if tagname == "pre" and false then
-		-- 			-- check if it "looks" like HTML
-		-- 			if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
-		-- 				-- tokenise the inner text
-		-- 				local text_tokens = M.tokenise( text_content )
-
-		-- 				-- and add it to the current token list
-		-- 				for _, tok in ipairs(text_tokens) do
-		-- 					if tok.value == nil then
-		-- 						print( "\t::: " .. tok.type )
-		-- 					else
-		-- 						print( "\t::: " .. tok.type .. ": " .. tostring(tok.value) )
-		-- 					end
-
-		-- 					table.insert( TOKENS, tok )
-		-- 				end
-		-- 			else
-		-- 				-- treat it as text
-		-- 				table.insert( TOKENS, {type="TEXT", value=text_content} )
-		-- 			end
-
-		-- 		end
-
-		-- 		i = end_tag
-		-- 	end
-
-
-		-- 	in_tag = false
-		-- 	goto continue
-		-- end
-
-
-
-		-- if #text_memory ~= 0 then
-		--	if in_tag and currently_opened_quotes == nil then
-		--		local word = trim(text_memory)
-
-		--		if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
-		--			if RAW_TEXT_TAGS[word] then
-		--				logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
-		--				-- made possible because of the whitespace removal at the start
-		--				i = content:find("</"..word, i, true) - 1
-		--			end
-		--		end
-
-		--		if not word:match("^%s*$") then
-		--			table.insert( TOKENS, {type="WORD", value=word})
-		--		end
-		--	else
-		--		table.insert( TOKENS, {type="TEXT", value=text_memory} )
-		--	end
-
-		--	text_memory = ""
-		-- end
-
-		-- in_tag = false
-		-- table.insert( TOKENS, {type = "END_TAG"} )
-
-		-- goto continue
-		-- end
-
-
-
+		-- Handle content within tags
 		if in_tag then
-			if currently_opened_quotes == nil and char:match("%s") then
-				if #text_memory ~= 0 then
-					local word = trim(text_memory)
-
-					-- if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
-					-- 	if RAW_TEXT_TAGS[word] then
-					-- 		logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
-					-- 		text_memory = ""
-
-					-- 		-- advance to closing ">"
-					-- 		i = content:find(">", i, true)
-					-- 		-- made possible because of the whitespace removal at the start
-					-- 		i = content:find("</"..word, i, true) - 1
-					-- 	end
-					-- end
-
-					if not word:match("^%s*$") then
-						table.insert( TOKENS, {type="WORD", value=word})
-						text_memory = ""
-					end
-
-					goto continue
-				end
-			end
-
-			-- if char == "'" or char == '"' then
-			--	-- found matching closing quote type
-			--	if char == currently_opened_quotes then
-			--		currently_opened_quotes = nil
-			--	elseif currently_opened_quotes == nil then
-			--		currently_opened_quotes = char
-			--	end
-			-- end
-
+			-- Handle quoted content
+			if currently_opened_quotes ~= nil then
+				if char == currently_opened_quotes then
+					-- End of quoted section
 					text_memory = text_memory .. char
-			goto continue
+					currently_opened_quotes = nil
+				else
+					-- Continue collecting quoted content
+					text_memory = text_memory .. char
+				end
+			else
+				-- Start of quoted section
+				if char == "'" or char == '"' then
+					text_memory = text_memory .. char
+					currently_opened_quotes = char
+					-- Handle equals sign
+				elseif char == "=" then
+					-- If we have an attribute name saved and empty text_memory, this is an equals after whitespace
+					if attr_name and #text_memory == 0 then
+						text_memory = attr_name .. "="
+						attr_name = nil
+					else
+						text_memory = text_memory .. "="
+					end
+					-- Handle whitespace in tags
+				elseif char:match("%s") then
+					-- If text has accumulated, look ahead to decide whether it is an attribute name or a complete word
+					if #text_memory > 0 then
+						-- Check if the next non-whitespace char is an equals sign
+						local next_pos = i + 1
+						while next_pos <= #content do
+							local next_char = content:sub(next_pos, next_pos)
+							if not next_char:match("%s") then
+								if next_char == "=" then
+									-- This is an attribute name followed by whitespace and equals
+									attr_name = text_memory
+									text_memory = ""
+								else
+									-- This is a complete word
+									local word = trim(text_memory)
+									if not word:match("^%s*$") then
+										table.insert(TOKENS, {type="WORD", value=word})
+									end
+									text_memory = ""
+									attr_name = nil
+								end
+								break
+							end
+							next_pos = next_pos + 1
+						end
+
+						-- If we reached the end of the content
+						if next_pos > #content then
+							local word = trim(text_memory)
+							if not word:match("^%s*$") then
+								table.insert(TOKENS, {type="WORD", value=word})
+							end
+							text_memory = ""
+							attr_name = nil
+						end
+					end
 				else
 					text_memory = text_memory .. char
-			goto continue
 				end
-
+			end
+		else
+			-- We're not in a tag, so collect text content
+			text_memory = text_memory .. char
+		end

 		::continue::
-		i = i+1
+		i = i + 1
 	end

+	-- Handle any remaining text
+	if #text_memory > 0 then
+		if in_tag then
+			local word = trim(text_memory)
+			if not word:match("^%s*$") then
+				table.insert(TOKENS, {type="WORD", value=word})
+			end
+		else
+			table.insert(TOKENS, {type="TEXT", value=text_memory})
+		end
+	end

 	return TOKENS
 end