From a8a295aaf1e92cdae092377e7d77d2b6756720be Mon Sep 17 00:00:00 2001 From: Guilian Date: Sun, 19 Jan 2025 14:00:13 +0100 Subject: [PATCH] fix: ignore empty words in tokenisation --- html.lua | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/html.lua b/html.lua index 4bddea3..d317aa5 100644 --- a/html.lua +++ b/html.lua @@ -238,7 +238,9 @@ function M.tokenise( content ) end end - table.insert( TOKENS, {type="WORD", value=word}) + if not word:match("^%s*$") then + table.insert( TOKENS, {type="WORD", value=word}) + end else table.insert( TOKENS, {type="TEXT", value=text_memory} ) end @@ -271,8 +273,10 @@ function M.tokenise( content ) end end - table.insert( TOKENS, {type="WORD", value=word}) - text_memory = "" + if not word:match("^%s*$") then + table.insert( TOKENS, {type="WORD", value=word}) + text_memory = "" + end goto continue end @@ -418,7 +422,7 @@ function M.parse_tokens_into_document( TOKENS ) name = token.value:match("([%w-]+)") if name == nil then - error("Unrecognised word: " .. name) + error("Unrecognised word: " .. tostring(name) .. " (Token ".. tostring(i) .." , type=" .. tostring(token.type) .. ", value=" .. tostring(token.value) .. ")") end current_doc_element.attributes[name] = true