From 83b1ac0c83a62719a44bd1b3d08cc8c7be437bd3 Mon Sep 17 00:00:00 2001 From: Guilian Date: Tue, 29 Apr 2025 20:14:49 +0200 Subject: [PATCH] fix: spaces around '=' sign in html tag attributes --- html.lua | 365 +++++++++++++++++++------------------------------------ 1 file changed, 128 insertions(+), 237 deletions(-) diff --git a/html.lua b/html.lua index d464ead..52e8de8 100644 --- a/html.lua +++ b/html.lua @@ -293,44 +293,15 @@ function M.tokenise( content ) local in_tag = nil local currently_opened_quotes = nil local text_memory = "" - + local attr_name = nil -- Track attribute name when we encounter whitespace before equals local i = 1 while i <= #content do - local char = content:sub(i,i) - - - -- - -- Taking care of quotes - -- - if in_tag then - -- finding matching quotes - if currently_opened_quotes ~= nil and char == currently_opened_quotes then - currently_opened_quotes = nil - text_memory = text_memory .. char - goto continue - end - - -- Opening a new set of quotes - if currently_opened_quotes == nil and (char == "'" or char == '"') then - currently_opened_quotes = char - text_memory = text_memory .. char - goto continue - end - - -- reaching here means: - -- - we're in a tag, inside quotes - -- - the character is not the closing quote mark - -- So just add it and get on with it. - if currently_opened_quotes ~= nil then - text_memory = text_memory .. char - goto continue - end - end - - + local char = content:sub(i, i) + -- Handle comments and doctype declarations if char == "<" then + -- Handle comments if content:sub(i, i+3) == "", i+3, true) if end_i then @@ -338,266 +309,186 @@ function M.tokenise( content ) else i = #content end - goto continue end + -- Handle doctype declarations if content:sub(i, i+1) == "", i, true) goto continue end - --------------------------------- + -- Save any accumulated text before starting a new tag if #text_memory ~= 0 then - table.insert( TOKENS, {type="TEXT", value=text_memory} ) + table.insert(TOKENS, {type="TEXT", value=text_memory}) text_memory = "" end - -- closing tag + -- Reset attribute tracking + attr_name = nil + + -- Handle closing tags if content:sub(i, i+1) == "" and in_tag then - -- first, cleanup the text_memory, as the closing > is often side-by-side with the last "word" + -- Handle end of tag + if char == ">" and in_tag and currently_opened_quotes == nil then + -- Process any remaining text in the tag if #text_memory ~= 0 then local word = trim(text_memory) if not word:match("^%s*$") then - table.insert( TOKENS, {type="WORD", value=word}) + table.insert(TOKENS, {type="WORD", value=word}) end text_memory = "" end - table.insert( TOKENS, {type = "END_TAG"} ) + -- Reset attribute tracking + attr_name = nil - -- closing tags don't require any more work. - if in_tag == "closing" then - in_tag = nil - goto continue - end - in_tag = nil + table.insert(TOKENS, {type="END_TAG"}) - local curr_token = #TOKENS - while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do - curr_token = curr_token - 1 - end - curr_token = curr_token + 1 - - if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" then - error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.") - end - - local tagname = TOKENS[curr_token].value - - if RAW_TEXT_TAGS[tagname] then - local end_tag = (content:find(" 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do + curr_token = curr_token - 1 end - local text_content = content:sub(i+1, end_tag) + curr_token = curr_token + 1 - -- special handling of pre - if tagname == "pre" then - -- check if it "looks" like HTML - if text_content:find("<", 1, true) and text_content:find(">", 1, true) then - -- tokenise the inner text - local text_tokens = M.tokenise( text_content ) - - -- and add it to the current token list - for _, tok in ipairs(text_tokens) do - table.insert( TOKENS, tok ) + if curr_token <= #TOKENS and TOKENS[curr_token].type == "WORD" then + local tagname = TOKENS[curr_token].value + if RAW_TEXT_TAGS[tagname] then + local end_tag = (content:find("", 1, true) then + -- Tokenise the inner text + local text_tokens = M.tokenise(text_content) + -- Add it to the current token list + for _, tok in ipairs(text_tokens) do + table.insert(TOKENS, tok) + end + i = end_tag + goto continue + end + end + -- Treat the rest as text i = end_tag + table.insert(TOKENS, {type="TEXT", value=text_content}) goto continue end end - -- treat the rest as text - - i = end_tag - table.insert( TOKENS, {type="TEXT", value=text_content} ) - goto continue end - - - + in_tag = nil goto continue end - - - - - - ---------------------------------------------------- - --- "OLD", UNCHECKED CODE - - - - - - - -- if char == ">" and in_tag and currently_opened_quotes == nil then - -- if #text_memory ~= 0 then - -- local word = trim(text_memory) - -- if not word:match("^%s*$") then - -- table.insert( TOKENS, {type="WORD", value=word}) - -- end - -- text_memory = "" - -- end - - - -- table.insert( TOKENS, {type = "END_TAG"} ) - - - -- local curr_token = #TOKENS - -- while curr_token > 0 and (TOKENS[curr_token].type ~= "START_OPENING_TAG" or TOKENS[curr_token].type ~= "START_CLOSING_TAG") do - -- curr_token = curr_token - 1 - -- end - -- curr_token = curr_token + 1 - -- if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" and TOKENS[curr_token].type ~= "START_CLOSING_TAG" then - -- error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.") - -- end - - -- if TOKENS[curr_token].type == "START_CLOSING_TAG" then - -- goto continue - -- end - - - -- local tagname = TOKENS[curr_token+1].value - - -- if RAW_TEXT_TAGS[tagname] then - -- logger.printerr("Warning: "..tagname.." tags may contain text that would be incorrectly parsed as HTML.") - - -- print(content:sub(1,i-1)) - -- print(("="):rep(40)) - -- print(content:sub(i)) - - -- local end_tag = content:find("", 1, true) then - -- -- tokenise the inner text - -- local text_tokens = M.tokenise( text_content ) - - -- -- and add it to the current token list - -- for _, tok in ipairs(text_tokens) do - -- if tok.value == nil then - -- print( "\t::: " .. tok.type ) - -- else - -- print( "\t::: " .. tok.type .. ": " .. tostring(tok.value) ) - -- end - - -- table.insert( TOKENS, tok ) - -- end - -- else - -- -- treat it as text - -- table.insert( TOKENS, {type="TEXT", value=text_content} ) - -- end - - -- end - - -- i = end_tag - -- end - - - -- in_tag = false - -- goto continue - -- end - - - - -- if #text_memory ~= 0 then - -- if in_tag and currently_opened_quotes == nil then - -- local word = trim(text_memory) - - -- if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then - -- if RAW_TEXT_TAGS[word] then - -- logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.") - -- -- made possible because of the whitespace removal at the start - -- i = content:find("" - -- i = content:find(">", i, true) - -- -- made possible because of the whitespace removal at the start - -- i = content:find(" 0 then + -- Check if the next non-whitespace char is an equals sign + local next_pos = i + 1 + while next_pos <= #content do + local next_char = content:sub(next_pos, next_pos) + if not next_char:match("%s") then + if next_char == "=" then + -- This is an attribute name followed by whitespace and equals + attr_name = text_memory + text_memory = "" + else + -- This is a complete word + local word = trim(text_memory) + if not word:match("^%s*$") then + table.insert(TOKENS, {type="WORD", value=word}) + end + text_memory = "" + attr_name = nil + end + break + end + next_pos = next_pos + 1 + end - goto continue + -- If we reached the end of the content + if next_pos > #content then + local word = trim(text_memory) + if not word:match("^%s*$") then + table.insert(TOKENS, {type="WORD", value=word}) + end + text_memory = "" + attr_name = nil + end + end + else + text_memory = text_memory .. char end end - - -- if char == "'" or char == '"' then - -- -- found matching closing quote type - -- if char == currently_opened_quotes then - -- currently_opened_quotes = nil - -- elseif currently_opened_quotes == nil then - -- currently_opened_quotes = char - -- end - -- end - - text_memory = text_memory .. char - goto continue else + -- We're not in a tag, so collect text content text_memory = text_memory .. char - goto continue end - ::continue:: - i = i+1 + i = i + 1 end + -- Handle any remaining text + if #text_memory > 0 then + if in_tag then + local word = trim(text_memory) + if not word:match("^%s*$") then + table.insert(TOKENS, {type="WORD", value=word}) + end + else + table.insert(TOKENS, {type="TEXT", value=text_memory}) + end + end return TOKENS end