2025-01-20 21:16:24 +01:00
|
|
|
local logger = require("logging")
|
2025-01-14 19:48:32 +01:00
|
|
|
|
|
|
|
--- Strip leading and trailing whitespace from a string.
-- @param str string to trim
-- @return `str` without its surrounding whitespace (possibly the empty string)
local function trim(str)
    -- the lazy capture leaves trailing whitespace to the final `%s*`
    local core = str:match("^%s*(.-)%s*$")
    return core
end
|
|
|
|
|
2025-01-18 18:50:03 +01:00
|
|
|
--- Build a new table holding the same key/value pairs as `t`.
-- Only the top level is duplicated: values that are themselves tables are
-- shared between the original and the copy.
-- @param t table to copy
-- @return a fresh table with every pair of `t`
local function shallow_copy(t)
    local duplicate = {}

    for key, value in pairs(t) do
        duplicate[key] = value
    end

    return duplicate
end
|
2025-01-14 19:48:32 +01:00
|
|
|
|
|
|
|
|
|
|
|
-- Module table; the public functions of this file are attached to it.
local M = {}

-- Tags whose content the tokeniser grabs in one go, up to the matching
-- "</tag", instead of scanning it character by character.
local RAW_TEXT_TAGS = { pre = true, script = true, style = true }
|
|
|
|
|
|
|
|
-- Void tags: elements without content (the so-called "self-closing" tags).
-- They never receive children and need no closing tag.
local VOID_TAGS = {
    area = true, base = true, br = true, col = true,
    embed = true, hr = true, img = true, input = true,
    link = true, meta = true, source = true, track = true,
    wbr = true,
    param = true, -- deprecated
}
|
|
|
|
|
2025-01-20 17:05:04 +01:00
|
|
|
-- Tags treated as inline when extracting text/markdown: no line break is
-- appended after them. Any tag missing from this set is handled as a block.
local INLINE_TAGS = {
    -- Text formatting
    a = true, abbr = true, b = true, bdi = true, bdo = true,
    cite = true, code = true, data = true, dfn = true, em = true,
    i = true, kbd = true, mark = true, q = true, ruby = true,
    s = true, samp = true, small = true, span = true, strong = true,
    sub = true, sup = true, time = true, u = true, var = true,

    -- Interactive elements
    button = true, label = true, select = true, textarea = true,

    -- Media/content
    img = true, picture = true, map = true, object = true,

    -- Line break
    br = true, wbr = true,

    -- Forms
    input = true, output = true, progress = true, meter = true,

    -- Scripting
    script = true, noscript = true, template = true,
}
|
|
|
|
|
2025-02-05 13:50:15 +01:00
|
|
|
--- Collapse every run of whitespace in `str` into a single space.
-- @param str string to normalise
-- @return exactly one value: the normalised string
local function normalize_whitespace(str)
    -- gsub returns (string, replacement count); the parentheses keep only
    -- the string so the count cannot leak into a caller's argument list.
    return (str:gsub("%s+", " "))
end
|
2025-01-14 19:48:32 +01:00
|
|
|
|
|
|
|
--- Create a DOM element and, when a parent is given, append it to the
-- parent's `children` list.
--
-- The element is a table with the fields `tag_name`, `attributes`,
-- `content`, `children` and `parent`, plus the methods defined below.
-- A metatable redirects access to unknown keys to `attributes`:
--   * reading an unknown key returns `attributes[key]`;
--   * writing an unknown key updates an existing attribute, and raises an
--     error when no attribute of that name exists.
-- Note that fields constructed as nil (`tag_name` / `parent` of a root
-- element) count as unknown keys for this purpose.
--
-- @param tag_name    tag name (":text" for text nodes, nil for the root)
-- @param parent_elem parent element, or nil
-- @return the new element
function M.make_dom_element( tag_name, parent_elem )
    local o = {
        tag_name = tag_name,
        attributes = {},
        content = "",

        children = {},
        parent = parent_elem,

        --- 1-based position of this element in its parent's children.
        -- @return the index, or -1 when there is no parent or the element
        --         is not (or no longer) listed in the parent's children
        get_child_index = function( self )
            if not self.parent then
                return -1
            end

            for i, child in ipairs(self.parent.children) do
                if child == self then return i end
            end

            -- previously fell off the end and returned nothing
            return -1
        end,

        --- Element following this one in the parent's children, or nil.
        get_next_sibling = function( self )
            if not self.parent then return nil end

            local found_self = false
            for _, child in ipairs(self.parent.children) do
                if found_self then
                    return child
                end

                if child == self then
                    found_self = true
                end
            end

            return nil
        end,

        --- Method form of M.check_simple_selector.
        check_simple_selector = function( self, selector )
            return M.check_simple_selector( self, selector )
        end,

        --- Call `fn` on this element, then on every descendant (pre-order).
        foreach = function( self, fn )
            fn( self )

            for _, child in ipairs(self.children or {}) do
                child:foreach( fn )
            end
        end,

        --- Concatenated text of this element and its descendants.
        -- A "\n" is appended after every child whose tag is not in
        -- INLINE_TAGS.
        -- NOTE(review): ":text" is not in INLINE_TAGS, so a newline also
        -- follows every text node — confirm this is intended.
        inner_text = function(self)
            if self.tag_name == ":text" then
                return self.content
            end

            local parts = {}
            for _, child in ipairs(self.children) do
                parts[#parts + 1] = child:inner_text()

                if not INLINE_TAGS[child.tag_name] then
                    parts[#parts + 1] = "\n"
                end
            end

            return table.concat(parts)
        end,

        --- Render this element and its descendants as Markdown-like text.
        -- @param in_pre    true when an ancestor is a <pre>; it is only
        --                  propagated to children, nothing reads it yet
        -- @param root_call true (the default) for the outermost call; only
        --                  that call runs the final newline clean-up
        -- @return the rendered text
        inner_markdown = function(self, in_pre, root_call)
            in_pre = in_pre or false
            -- `root_call or true` is always true; default only when omitted
            if root_call == nil then
                root_call = true
            end

            -- the document root has no tag name
            local tag = self.tag_name or ""

            if tag == "script" or tag == "style" then
                return ""
            end

            if tag == ":text" then
                return self.content
            end

            local text = ""
            local is_list_item = tag == "li"
            local parent_is_ul = self.parent and self.parent.tag_name == "ul"
            local parent_is_ol = self.parent and self.parent.tag_name == "ol"

            local is_heading = tag:match("^h[1-6]$")
            local is_pre = tag == "pre"

            if is_heading then
                local level = tonumber(tag:sub(2))
                text = "\n" .. string.rep("#", level) .. " "
            end

            if is_list_item then
                if parent_is_ul then
                    text = "* "
                elseif parent_is_ol then
                    local position = self:get_child_index()
                    text = position .. ". "
                end
            end

            -- Process children. They are never the root call, and they are
            -- "in pre" when this element or one of its ancestors is a <pre>.
            local child_in_pre = in_pre or is_pre
            local pieces = {}
            for _, child in ipairs(self.children) do
                pieces[#pieces + 1] = child:inner_markdown(child_in_pre, false)
            end
            local inner = table.concat(pieces)

            if tag == "br" then
                text = text .. "\n" .. inner
            elseif is_pre then
                text = text .. inner
            elseif is_heading then
                text = text .. normalize_whitespace(inner)
            elseif tag == "strong" then
                text = text .. "**" .. normalize_whitespace(inner) .. "**"
            elseif tag == "em" then
                text = text .. "_" .. normalize_whitespace(inner) .. "_"
            elseif tag == "code" then
                local is_block = self.parent and self.parent.tag_name == "pre"
                if is_block then
                    text = text .. "\n```\n" .. inner .. "\n```\n"
                else
                    text = text .. "`" .. normalize_whitespace(inner) .. "`"
                end
            elseif tag == "a" then
                text = text .. "[" .. normalize_whitespace(inner) .. "]"

                if self.attributes.href then
                    text = text .. "(" .. self.attributes.href .. ")"
                end
            else
                text = text .. inner
            end

            -- Add newlines after block elements
            if not INLINE_TAGS[tag] then
                text = text .. "\n"
            end

            if root_call then
                -- Step 1: Remove whitespace between newlines
                text = text:gsub("(\n)%s+(\n)", "%1%2")
                -- Step 2: Replace 3+ consecutive newlines with just two
                text = text:gsub("\n\n\n+", "\n\n")
            end

            return text
        end
    }

    if parent_elem then
        table.insert( parent_elem.children, o )
    end

    -- `tbl` rather than `table`, so the standard library is not shadowed
    local mt = {
        __newindex = function(tbl, key, value)
            -- Allow modification of existing attributes
            if rawget(tbl.attributes, key) ~= nil then
                rawset(tbl.attributes, key, value)
            else
                -- Prevent adding new attributes
                error("Cannot add new attribute to DOM element: " .. tostring(key))
            end
        end,
        __index = function(tbl, key)
            -- Allow access to attributes
            return rawget(tbl.attributes, key)
        end
    }

    setmetatable(o, mt)
    return o
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
--- Normalise raw HTML before tokenising.
-- @param content HTML source string
-- @return the normalised string
function M.preprocess( content )
    -- remove "self closing" slashes as they MUST be ignored (spec)
    -- and would cause problems
    content = content:gsub("/%s*>", ">")

    -- Remove whitespace around the slash of "</closing>" tags, so that
    -- "< /div>" and "</ div>" both become "</div>". The previous pattern
    -- ("</%s*/%s*") required two slashes and never matched such tags; the
    -- tokeniser's plain search for "</tagname" relies on this clean-up.
    content = content:gsub("<%s*/%s*", "</")

    return content
end
|
|
|
|
|
|
|
|
|
|
|
|
--- Split an HTML string into a flat list of tokens.
--
-- Emitted tokens (tables with a `type` field):
--   * {type="TEXT", value=...}      text found outside of tags
--   * {type="START_OPENING_TAG"}    a "<"
--   * {type="START_CLOSING_TAG"}    a "</"
--   * {type="WORD", value=...}      tag name or attribute inside a tag;
--                                   quoted sections stay in one word
--   * {type="END_TAG"}              the ">" ending a tag
-- Comments ("<!-- -->") and other "<!...>" declarations are skipped.
-- The content of RAW_TEXT_TAGS is emitted as a single TEXT token, except
-- for a <pre> whose content contains both "<" and ">", which is tokenised
-- recursively.
--
-- @param content HTML source (callers in this file pass M.preprocess output)
-- @return list of tokens
function M.tokenise( content )
    local TOKENS = {}

    -- state
    local in_tag = nil                  -- nil, "opening" or "closing"
    local currently_opened_quotes = nil -- quote character currently open
    local text_memory = ""              -- characters gathered so far

    local i = 1

    while i <= #content do
        local char = content:sub(i,i)

        --
        -- Taking care of quotes
        --
        if in_tag then
            -- finding matching quotes
            if currently_opened_quotes ~= nil and char == currently_opened_quotes then
                currently_opened_quotes = nil
                text_memory = text_memory .. char
                goto continue
            end

            -- Opening a new set of quotes
            if currently_opened_quotes == nil and (char == "'" or char == '"') then
                currently_opened_quotes = char
                text_memory = text_memory .. char
                goto continue
            end

            -- reaching here means:
            -- - we're in a tag, inside quotes
            -- - the character is not the closing quote mark
            -- So just add it and get on with it.
            if currently_opened_quotes ~= nil then
                text_memory = text_memory .. char
                goto continue
            end
        end

        if char == "<" then
            -- comment: skip to the end of "-->" (or to the end of input)
            if content:sub(i, i+3) == "<!--" then
                local end_i = content:find("-->", i+3, true)
                if end_i then
                    i = end_i + 2
                else
                    i = #content
                end

                goto continue
            end

            -- other declaration (doctype, ...): skip to the next ">".
            -- Without a ">" the rest of the input is skipped; this used to
            -- set `i` to nil and crash on the increment below.
            if content:sub(i, i+1) == "<!" then
                i = content:find(">", i, true) or #content
                goto continue
            end

            ---------------------------------
            if #text_memory ~= 0 then
                table.insert( TOKENS, {type="TEXT", value=text_memory} )
                text_memory = ""
            end

            -- closing tag
            if content:sub(i, i+1) == "</" then
                table.insert( TOKENS, {type="START_CLOSING_TAG"} )
                in_tag = "closing"
                i = i+1
                goto continue
            end

            table.insert( TOKENS, {type="START_OPENING_TAG"} )
            in_tag = "opening"
            goto continue
        end

        if char == ">" and in_tag then
            -- first, cleanup the text_memory, as the closing > is often side-by-side with the last "word"
            if #text_memory ~= 0 then
                local word = trim(text_memory)
                if not word:match("^%s*$") then
                    table.insert( TOKENS, {type="WORD", value=word})
                end
                text_memory = ""
            end

            table.insert( TOKENS, {type = "END_TAG"} )

            -- closing tags don't require any more work.
            if in_tag == "closing" then
                in_tag = nil
                goto continue
            end
            in_tag = nil

            -- wind back to the token right after the START_OPENING_TAG:
            -- the tag name
            local curr_token = #TOKENS
            while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do
                curr_token = curr_token - 1
            end
            curr_token = curr_token + 1

            if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" then
                error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
            end

            local tagname = TOKENS[curr_token].value

            if RAW_TEXT_TAGS[tagname] then
                -- index of the last character before the "</tagname"
                local end_tag = (content:find("</"..tagname, i, true) or 0) - 1
                if end_tag < 1 then
                    -- NOTE(review): terminating the whole process from
                    -- library code is drastic; consider raising an error.
                    logger.printerr("Can't find closing " .. tagname .. "!")
                    print(content:sub(i))
                    os.exit(-5)
                end
                local text_content = content:sub(i+1, end_tag)

                -- special handling of pre
                if tagname == "pre" then
                    -- check if it "looks" like HTML
                    if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
                        -- tokenise the inner text
                        local text_tokens = M.tokenise( text_content )

                        -- and add it to the current token list
                        for _, tok in ipairs(text_tokens) do
                            table.insert( TOKENS, tok )
                        end

                        i = end_tag
                        goto continue
                    end
                end

                -- treat the rest as text
                i = end_tag
                table.insert( TOKENS, {type="TEXT", value=text_content} )
                goto continue
            end

            goto continue
        end

        -- Unquoted whitespace inside a tag ends the word being gathered.
        if in_tag and currently_opened_quotes == nil and char:match("%s") and #text_memory ~= 0 then
            local word = trim(text_memory)

            if not word:match("^%s*$") then
                table.insert( TOKENS, {type="WORD", value=word})
                text_memory = ""
            end

            goto continue
        end

        -- anything else is gathered (tag word or plain text)
        text_memory = text_memory .. char

        ::continue::
        i = i+1
    end

    -- Text following the last tag (or input without any tag) used to be
    -- dropped because TEXT tokens were only emitted on the next "<".
    if not in_tag and #text_memory ~= 0 then
        table.insert( TOKENS, {type="TEXT", value=text_memory} )
    end

    return TOKENS
end
|
|
|
|
|
|
|
|
|
2025-01-18 18:52:17 +01:00
|
|
|
--- Test whether an element matches a simple (non-combined) selector.
--
-- Selector fields read here, all optional:
--   * tag_name            required tag name
--   * id                  required value of the `id` attribute
--   * class               list of class names that must all be present
--   * attributes_values   map attribute name -> required value
--   * attributes_present  list of attribute names that must be set
--
-- @param element  DOM element to test; text nodes never match
-- @param selector selector table
-- @return true when every specified criterion matches, false otherwise
function M.check_simple_selector(element, selector)
    -- Skip text nodes
    if element.tag_name == ":text" then
        return false
    end

    -- Check tag name if specified
    if selector.tag_name and element.tag_name ~= selector.tag_name then
        return false
    end

    -- Check ID if specified
    if selector.id and element.attributes.id ~= selector.id then
        return false
    end

    -- Check classes if specified
    if selector.class and #selector.class > 0 then
        local element_classes = element.attributes.class
        -- A value-less `class` attribute is stored as `true`, not as a
        -- list; such an element carries no class name at all.
        if type(element_classes) ~= "table" then
            return false
        end

        for _, class in ipairs(selector.class) do
            local found = false
            for _, elem_class in ipairs(element_classes) do
                if elem_class == class then
                    found = true
                    break
                end
            end
            if not found then
                return false
            end
        end
    end

    -- Check attribute value selectors (a missing map means "no constraint")
    for attr_name, attr_value in pairs(selector.attributes_values or {}) do
        local elem_attr_value = element.attributes[attr_name]
        if elem_attr_value ~= attr_value then
            return false
        end
    end

    -- Check attribute presence selectors (a missing list means "no constraint")
    for _, attr_name in ipairs(selector.attributes_present or {}) do
        if not element.attributes[attr_name] then
            return false
        end
    end

    return true
end
|
|
|
|
|
2025-01-18 18:52:17 +01:00
|
|
|
--- Collect every node of a tree that matches a simple selector.
-- The tree is walked depth-first starting at `document` itself, so the
-- result is in document (pre-order) order.
-- @param document root node of the tree to search
-- @param selector selector table, as taken by M.check_simple_selector
-- @return list of matching nodes (empty when nothing matches)
function M.query_simple_selector(document, selector)
    local found = {}
    local pending = { document }

    while #pending > 0 do
        local node = table.remove(pending)

        if M.check_simple_selector(node, selector) then
            found[#found + 1] = node
        end

        -- push the children last-to-first so the first child is popped next
        local kids = node.children
        for idx = #kids, 1, -1 do
            pending[#pending + 1] = kids[idx]
        end
    end

    return found
end
|
|
|
|
|
|
|
|
|
2025-01-14 19:48:32 +01:00
|
|
|
--- Build a DOM tree out of the token list produced by M.tokenise.
--
-- A root element without tag name is created and elements are attached
-- below it as the tokens are consumed. M.clean_text_nodes is run on the
-- finished tree before it is returned.
--
-- @param TOKENS list of tokens
-- @return the root element of the document
function M.parse_tokens_into_document( TOKENS )
    local DOCUMENT = M.make_dom_element(nil, nil)
    local current_doc_element = DOCUMENT
    local in_opening_tag_for = nil -- name of the tag being opened, if any

    local i = 1
    while i <= #TOKENS do
        local token = TOKENS[i]
        -- Type of the previous token, nil for the first one. The former
        -- `i > 0` guard was always true and indexed TOKENS[0] when i == 1.
        local prev_type = TOKENS[i-1] and TOKENS[i-1].type

        if token.type == "WORD" then
            -- a word always belongs to a tag: leave the current text node
            if current_doc_element.tag_name == ":text" then
                current_doc_element = current_doc_element.parent
            end

            -- tag name of an opening tag: create the element and enter it
            if prev_type == "START_OPENING_TAG" then
                local new_elem = M.make_dom_element( token.value, current_doc_element )
                current_doc_element = new_elem
                in_opening_tag_for = token.value

                goto continue
            end

            -- tag name of a closing tag
            if prev_type == "START_CLOSING_TAG" then
                local curr_elem = current_doc_element

                -- If we find a closing tag, check if:
                -- - That tag is a void tag (childless, auto-closing)
                -- - The last child added to the current element is that tag
                --
                -- This avoids having <img> tags as parents of <p> tags for example
                local last_child = curr_elem.children[#curr_elem.children]
                if last_child and VOID_TAGS[last_child.tag_name] and last_child.tag_name == token.value then
                    goto continue
                end

                -- climb up to the closest open element with that name
                while curr_elem.parent and curr_elem.tag_name ~= token.value do
                    curr_elem = curr_elem.parent
                end

                if curr_elem.parent == nil then
                    -- reached DOCUMENT root
                    logger.printerr("Warning: reached document root while trying to match for closing " .. token.value .. " token.")
                    current_doc_element = DOCUMENT
                else
                    current_doc_element = curr_elem.parent
                end

                goto continue
            end

            -- any other word inside an opening tag is an attribute
            if in_opening_tag_for then
                local pattern = "([^=]+)=['\"](.+)['\"]"

                local name, raw_value = token.value:match(pattern)

                -- not of the form name="value": store it as a flag
                if name == nil or raw_value == nil then
                    name = token.value:match("([%w-]+)")

                    if name == nil then
                        error("Unrecognised word: " .. tostring(name) .. " (Token ".. tostring(i) .." , type=" .. tostring(token.type) .. ", value=" .. tostring(token.value) .. ")")
                    end

                    current_doc_element.attributes[name] = true

                    goto continue
                end

                local value = nil
                if raw_value == "" or raw_value == nil then
                    value = nil
                else
                    value = trim(raw_value)

                    -- the class attribute is stored as a list of names
                    if name == "class" then
                        local classes = {}

                        for class in value:gmatch("%S+") do
                            table.insert( classes, class )
                        end

                        value = classes
                    end
                end

                current_doc_element.attributes[name] = value

                goto continue
            end
        end

        if token.type == "END_TAG" then
            if in_opening_tag_for then
                -- void elements take no children: step back out right away
                if VOID_TAGS[in_opening_tag_for] then
                    if current_doc_element.parent == nil then
                        -- reached DOCUMENT root
                        current_doc_element = DOCUMENT
                    else
                        current_doc_element = current_doc_element.parent
                    end
                end
            end

            in_opening_tag_for = nil

            goto continue
        end

        if token.type == "TEXT" then
            local new_elem = M.make_dom_element( ":text", current_doc_element )
            new_elem.content = token.value
            current_doc_element = new_elem

            goto continue
        end

        ::continue::
        i = i+1
    end

    M.clean_text_nodes( DOCUMENT )

    return DOCUMENT
end
|
|
|
|
|
|
|
|
|
|
|
|
--- Tidy the text nodes of a tree, in place.
-- Text nodes holding only whitespace are removed from their parent; in
-- every other text node each run of whitespace is reduced to one space.
-- Subtrees rooted at a RAW_TEXT_TAGS element are left untouched.
-- @param node root of the (sub)tree to clean
function M.clean_text_nodes(node)
    local is_text = node.tag_name == ":text"

    if not is_text then
        -- Don't clean anything in raw text tags
        if RAW_TEXT_TAGS[node.tag_name] then
            return
        end

        -- walk a snapshot: the recursion may remove entries from the real list
        local snapshot = shallow_copy(node.children)
        for idx = 1, #snapshot do
            M.clean_text_nodes( snapshot[idx] )
        end
        return
    end

    -- purge content-less text nodes
    if #trim(node.content) == 0 then
        local owner = node.parent
        if not owner then
            error("Text node without a parent; should be impossible !")
        end

        local siblings = owner.children
        for idx = 1, #siblings do
            if siblings[idx] == node then
                table.remove( siblings, idx )
                break
            end
        end

        return
    end

    local collapsed = node.content:gsub("%s+", " ")
    node.content = collapsed
end
|
|
|
|
|
|
|
|
|
2025-01-19 13:47:54 +01:00
|
|
|
--- Serialise a node and its descendants to an indented HTML-like string.
--
-- Attributes are written in sorted name order so the output is the same
-- on every run (plain `pairs` order is unspecified).
--
-- @param node   node to serialise
-- @param indent indentation level of `node` (default 0)
-- @param include_internal_pseudoelements when true, pseudo elements (tag
--        names starting with ":", i.e. ":text" and the root) are written
--        as tags too; by default only their content is written
-- @return the serialised string, not trimmed
function M._tostring(node, indent, include_internal_pseudoelements)
    -- Default indentation is 0 (root level)
    indent = indent or 0
    include_internal_pseudoelements = include_internal_pseudoelements or false

    local display_name = node.tag_name or ":root"
    local is_pseudo_element = display_name:sub(1,1) == ":"

    local indent_level_str = " "
    -- Create the indentation string (e.g., " " for each level)
    local indent_str = string.rep(indent_level_str, indent)

    if node.tag_name == ":text" then
        local str = ""

        if include_internal_pseudoelements then
            str = str .. "<:text>"
        end

        str = str .. node.content

        if include_internal_pseudoelements then
            str = str .. "</:text>"
        end

        return str
    end

    -- whether the tag itself is written, or only its attributes/children
    local show_tag = not is_pseudo_element or include_internal_pseudoelements

    local node_name = ""

    if show_tag then
        -- Print the current node's tag name
        node_name = node_name .. "\n" .. indent_str .. "<" .. display_name
    end

    -- Print attributes if any, in a deterministic order
    local attr_names = {}
    for attr in pairs(node.attributes) do
        attr_names[#attr_names + 1] = attr
    end
    table.sort(attr_names, function(a, b) return tostring(a) < tostring(b) end)

    for _, attr in ipairs(attr_names) do
        local value = node.attributes[attr]
        if type(value) == "table" then
            -- list values (class) are joined with single spaces
            node_name = node_name .. " " .. attr .. "=\""
            for i, val in ipairs( value ) do
                if i > 1 then node_name = node_name .. " " end
                node_name = node_name .. tostring(val)
            end
            node_name = node_name .. "\""
        else
            node_name = node_name .. " " .. attr .. "=\"" .. tostring(value) .. "\""
        end
    end

    if show_tag then
        node_name = node_name .. ">"
    end

    -- children of a hidden pseudo element stay at the current level
    local next_indent = indent + 1
    if is_pseudo_element and not include_internal_pseudoelements then
        next_indent = indent
    end

    -- Recursively print children
    for _, child in ipairs(node.children) do
        node_name = node_name .. M._tostring(child, next_indent, include_internal_pseudoelements)
    end

    if not VOID_TAGS[node.tag_name] and show_tag then
        -- Print the closing tag. It goes on a line of its own when the
        -- output so far ends with a tag other than a "</:text>".
        local end_indent = ""
        local closing_text_tag = "</:text>"
        if node_name:sub(#node_name, #node_name) == ">" and node_name:sub(#node_name - #closing_text_tag + 1, #node_name) ~= closing_text_tag then
            end_indent = "\n" .. indent_str
        end
        node_name = node_name .. end_indent .. "</" .. display_name .. ">"
    end

    return node_name
end
|
|
|
|
|
2025-01-19 13:47:54 +01:00
|
|
|
--- Serialise a node to a string without surrounding whitespace.
-- Thin wrapper around M._tostring that trims the result.
-- @param node        node to serialise
-- @param base_indent indentation level given to M._tostring
-- @param include_internal_pseudoelements forwarded to M._tostring
-- @return the trimmed serialisation
function M.tostring(node, base_indent, include_internal_pseudoelements)
    local rendered = M._tostring(node, base_indent, include_internal_pseudoelements)
    return trim(rendered)
end
|
2025-01-18 13:14:05 +01:00
|
|
|
|
|
|
|
|
2025-01-18 18:52:52 +01:00
|
|
|
|
|
|
|
|
2025-01-14 19:48:32 +01:00
|
|
|
--- Parse an HTML string into a DOM tree.
-- Runs the three stages in order: preprocess, tokenise, tree building.
-- @param html_string HTML source
-- @return the root element built by M.parse_tokens_into_document
function M.parse( html_string )
    return M.parse_tokens_into_document(
        M.tokenise(
            M.preprocess( html_string )
        )
    )
end
|
|
|
|
|
|
|
|
return M
|