diff --git a/html.lua b/html.lua
index d464ead..52e8de8 100644
--- a/html.lua
+++ b/html.lua
@@ -293,44 +293,15 @@ function M.tokenise( content )
local in_tag = nil
local currently_opened_quotes = nil
local text_memory = ""
-
+ local attr_name = nil -- Track attribute name when we encounter whitespace before equals
local i = 1
while i <= #content do
- local char = content:sub(i,i)
-
-
- --
- -- Taking care of quotes
- --
- if in_tag then
- -- finding matching quotes
- if currently_opened_quotes ~= nil and char == currently_opened_quotes then
- currently_opened_quotes = nil
- text_memory = text_memory .. char
- goto continue
- end
-
- -- Opening a new set of quotes
- if currently_opened_quotes == nil and (char == "'" or char == '"') then
- currently_opened_quotes = char
- text_memory = text_memory .. char
- goto continue
- end
-
- -- reaching here means:
- -- - we're in a tag, inside quotes
- -- - the character is not the closing quote mark
- -- So just add it and get on with it.
- if currently_opened_quotes ~= nil then
- text_memory = text_memory .. char
- goto continue
- end
- end
-
-
+ local char = content:sub(i, i)
+ -- Handle comments and doctype declarations
if char == "<" then
+ -- Handle comments
if content:sub(i, i+3) == "", i+3, true)
if end_i then
@@ -338,266 +309,186 @@ function M.tokenise( content )
else
i = #content
end
-
goto continue
end
+ -- Handle doctype declarations
if content:sub(i, i+1) == "", i, true)
goto continue
end
- ---------------------------------
+ -- Save any accumulated text before starting a new tag
if #text_memory ~= 0 then
- table.insert( TOKENS, {type="TEXT", value=text_memory} )
+ table.insert(TOKENS, {type="TEXT", value=text_memory})
text_memory = ""
end
- -- closing tag
+ -- Reset attribute tracking
+ attr_name = nil
+
+ -- Handle closing tags
if content:sub(i, i+1) == "</" then
- table.insert( TOKENS, {type="START_CLOSING_TAG"} )
+ table.insert(TOKENS, {type="START_CLOSING_TAG"})
in_tag = "closing"
i = i+1
goto continue
end
- table.insert( TOKENS, {type="START_OPENING_TAG"} )
+ -- Handle opening tags
+ table.insert(TOKENS, {type="START_OPENING_TAG"})
in_tag = "opening"
goto continue
end
-
- if char == ">" and in_tag then
- -- first, cleanup the text_memory, as the closing > is often side-by-side with the last "word"
+ -- Handle end of tag
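+        -- (a ">" inside a quoted attribute value must not close the tag)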
+ if char == ">" and in_tag and currently_opened_quotes == nil then
+ -- Process any remaining text in the tag
if #text_memory ~= 0 then
local word = trim(text_memory)
if not word:match("^%s*$") then
- table.insert( TOKENS, {type="WORD", value=word})
+ table.insert(TOKENS, {type="WORD", value=word})
end
text_memory = ""
end
- table.insert( TOKENS, {type = "END_TAG"} )
+ -- Reset attribute tracking
+ attr_name = nil
- -- closing tags don't require any more work.
- if in_tag == "closing" then
- in_tag = nil
- goto continue
- end
- in_tag = nil
+ table.insert(TOKENS, {type="END_TAG"})
- local curr_token = #TOKENS
- while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do
- curr_token = curr_token - 1
- end
- curr_token = curr_token + 1
-
- if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" then
- error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
- end
-
- local tagname = TOKENS[curr_token].value
-
- if RAW_TEXT_TAGS[tagname] then
-                local end_tag = (content:find("</"..tagname, i, true) or 0) - 1
- if end_tag < 1 then
- logger.printerr("Can't find closing " .. tagname .. "!")
- print(content:sub(i))
- os.exit(-5)
+ -- Handle special tags with raw content
+ if in_tag == "opening" then
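+                -- Walk back through TOKENS to find this tag's name (the WORD just after START_OPENING_TAG)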
+ local curr_token = #TOKENS
+ while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do
+ curr_token = curr_token - 1
end
- local text_content = content:sub(i+1, end_tag)
+ curr_token = curr_token + 1
- -- special handling of pre
- if tagname == "pre" then
- -- check if it "looks" like HTML
- if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
- -- tokenise the inner text
- local text_tokens = M.tokenise( text_content )
-
- -- and add it to the current token list
- for _, tok in ipairs(text_tokens) do
- table.insert( TOKENS, tok )
+ if curr_token <= #TOKENS and TOKENS[curr_token].type == "WORD" then
+ local tagname = TOKENS[curr_token].value
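+                    -- Tags in RAW_TEXT_TAGS (e.g. <pre>) keep their contents as raw text up to the matching closing tag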
+ if RAW_TEXT_TAGS[tagname] then
+                        local end_tag = (content:find("</"..tagname, i, true) or 0) - 1
+ if end_tag < 1 then
+ logger.printerr("Can't find closing " .. tagname .. "!")
+ print(content:sub(i))
+ os.exit(-5)
end
+ local text_content = content:sub(i+1, end_tag)
+ -- Special handling of pre
+ if tagname == "pre" then
+ -- Check if it "looks" like HTML
+ if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
+ -- Tokenise the inner text
+ local text_tokens = M.tokenise(text_content)
+ -- Add it to the current token list
+ for _, tok in ipairs(text_tokens) do
+ table.insert(TOKENS, tok)
+ end
+ i = end_tag
+ goto continue
+ end
+ end
+ -- Treat the rest as text
i = end_tag
+ table.insert(TOKENS, {type="TEXT", value=text_content})
goto continue
end
end
- -- treat the rest as text
-
- i = end_tag
- table.insert( TOKENS, {type="TEXT", value=text_content} )
- goto continue
end
-
-
-
+ in_tag = nil
goto continue
end
-
-
-
-
-
- ----------------------------------------------------
- --- "OLD", UNCHECKED CODE
-
-
-
-
-
-
- -- if char == ">" and in_tag and currently_opened_quotes == nil then
- -- if #text_memory ~= 0 then
- -- local word = trim(text_memory)
- -- if not word:match("^%s*$") then
- -- table.insert( TOKENS, {type="WORD", value=word})
- -- end
- -- text_memory = ""
- -- end
-
-
- -- table.insert( TOKENS, {type = "END_TAG"} )
-
-
- -- local curr_token = #TOKENS
- -- while curr_token > 0 and (TOKENS[curr_token].type ~= "START_OPENING_TAG" or TOKENS[curr_token].type ~= "START_CLOSING_TAG") do
- -- curr_token = curr_token - 1
- -- end
- -- curr_token = curr_token + 1
- -- if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" and TOKENS[curr_token].type ~= "START_CLOSING_TAG" then
- -- error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
- -- end
-
- -- if TOKENS[curr_token].type == "START_CLOSING_TAG" then
- -- goto continue
- -- end
-
-
- -- local tagname = TOKENS[curr_token+1].value
-
- -- if RAW_TEXT_TAGS[tagname] then
- -- logger.printerr("Warning: "..tagname.." tags may contain text that would be incorrectly parsed as HTML.")
-
- -- print(content:sub(1,i-1))
- -- print(("="):rep(40))
- -- print(content:sub(i))
-
-            -- local end_tag = content:find("</"..tagname, i, true) - 1
- -- local text_content = content:sub(i+1, end_tag)
-
- -- if tagname == "pre" and false then
- -- -- check if it "looks" like HTML
- -- if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
- -- -- tokenise the inner text
- -- local text_tokens = M.tokenise( text_content )
-
- -- -- and add it to the current token list
- -- for _, tok in ipairs(text_tokens) do
- -- if tok.value == nil then
- -- print( "\t::: " .. tok.type )
- -- else
- -- print( "\t::: " .. tok.type .. ": " .. tostring(tok.value) )
- -- end
-
- -- table.insert( TOKENS, tok )
- -- end
- -- else
- -- -- treat it as text
- -- table.insert( TOKENS, {type="TEXT", value=text_content} )
- -- end
-
- -- end
-
- -- i = end_tag
- -- end
-
-
- -- in_tag = false
- -- goto continue
- -- end
-
-
-
- -- if #text_memory ~= 0 then
- -- if in_tag and currently_opened_quotes == nil then
- -- local word = trim(text_memory)
-
- -- if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
- -- if RAW_TEXT_TAGS[word] then
- -- logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
- -- -- made possible because of the whitespace removal at the start
-            -- i = content:find("</"..word, i, true) - 1
- -- end
- -- end
-
- -- if not word:match("^%s*$") then
- -- table.insert( TOKENS, {type="WORD", value=word})
- -- end
- -- else
- -- table.insert( TOKENS, {type="TEXT", value=text_memory} )
- -- end
-
- -- text_memory = ""
- -- end
-
- -- in_tag = false
- -- table.insert( TOKENS, {type = "END_TAG"} )
-
- -- goto continue
- -- end
-
-
-
+ -- Handle content within tags
if in_tag then
- if currently_opened_quotes == nil and char:match("%s") then
- if #text_memory ~= 0 then
- local word = trim(text_memory)
-
- -- if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
- -- if RAW_TEXT_TAGS[word] then
- -- logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
- -- text_memory = ""
-
- -- -- advance to closing ">"
- -- i = content:find(">", i, true)
- -- -- made possible because of the whitespace removal at the start
-                    -- i = content:find("</"..word, i, true) - 1
- -- end
- -- end
-
- if not word:match("^%s*$") then
- table.insert( TOKENS, {type="WORD", value=word})
- text_memory = ""
+ -- Handle quoted content
+ if currently_opened_quotes ~= nil then
+ if char == currently_opened_quotes then
+ -- End of quoted section
+ text_memory = text_memory .. char
+ currently_opened_quotes = nil
+ else
+ -- Continue collecting quoted content
+ text_memory = text_memory .. char
+ end
+ else
+ -- Start of quoted section
+ if char == "'" or char == '"' then
+ text_memory = text_memory .. char
+ currently_opened_quotes = char
+ -- Handle equals sign
+ elseif char == "=" then
+ -- If we have an attribute name saved and empty text_memory, this is an equals after whitespace
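+                    -- e.g. 'class = "x"' becomes 'class="x"', keeping the quoted value attached to its attribute name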
+ if attr_name and #text_memory == 0 then
+ text_memory = attr_name .. "="
+ attr_name = nil
+ else
+ text_memory = text_memory .. "="
end
+ -- Handle whitespace in tags
+ elseif char:match("%s") then
+                    -- Accumulated text may be an attribute name separated from its '=' by whitespace
+ if #text_memory > 0 then
+ -- Check if the next non-whitespace char is an equals sign
+ local next_pos = i + 1
+ while next_pos <= #content do
+ local next_char = content:sub(next_pos, next_pos)
+ if not next_char:match("%s") then
+ if next_char == "=" then
+ -- This is an attribute name followed by whitespace and equals
+ attr_name = text_memory
+ text_memory = ""
+ else
+ -- This is a complete word
+ local word = trim(text_memory)
+ if not word:match("^%s*$") then
+ table.insert(TOKENS, {type="WORD", value=word})
+ end
+ text_memory = ""
+ attr_name = nil
+ end
+ break
+ end
+ next_pos = next_pos + 1
+ end
- goto continue
+ -- If we reached the end of the content
+ if next_pos > #content then
+ local word = trim(text_memory)
+ if not word:match("^%s*$") then
+ table.insert(TOKENS, {type="WORD", value=word})
+ end
+ text_memory = ""
+ attr_name = nil
+ end
+ end
+ else
+ text_memory = text_memory .. char
end
end
-
- -- if char == "'" or char == '"' then
- -- -- found matching closing quote type
- -- if char == currently_opened_quotes then
- -- currently_opened_quotes = nil
- -- elseif currently_opened_quotes == nil then
- -- currently_opened_quotes = char
- -- end
- -- end
-
- text_memory = text_memory .. char
- goto continue
else
+ -- We're not in a tag, so collect text content
text_memory = text_memory .. char
- goto continue
end
-
::continue::
- i = i+1
+ i = i + 1
end
+ -- Handle any remaining text
+ if #text_memory > 0 then
+ if in_tag then
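+            -- Input ended inside an unfinished tag; emit the leftover text as a WORD token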
+ local word = trim(text_memory)
+ if not word:match("^%s*$") then
+ table.insert(TOKENS, {type="WORD", value=word})
+ end
+ else
+ table.insert(TOKENS, {type="TEXT", value=text_memory})
+ end
+ end
return TOKENS
end