local logger = require(".logging") local function trim(str) return str:match("^%s*(.-)%s*$") end local function shallow_copy(t) local t2 = {} for k,v in pairs(t) do t2[k] = v end return t2 end local M = {} local RAW_TEXT_TAGS = { script = true, style = true, pre = true } -- void tags are content-less, or so-called "self-closing", tags local VOID_TAGS = { area = true, base = true, br = true, col = true, embed = true, hr = true, img = true, input = true, link = true, meta = true, param = true, -- deprecated source = true, track = true, wbr = true, } function M.make_dom_element( tag_name, parent_elem ) local o = { tag_name = tag_name, attributes = {}, content = "", children = {}, parent = parent_elem, get_child_index = function( self ) if not self.parent then return -1 end for i, child in ipairs(self.parent.children) do if child == self then return i end end end, get_next_sibling = function( self ) if not self.parent then return nil end local found_self = false for _, child in ipairs(self.parent.children) do if found_self then return child end if child == self then found_self = true end end return nil end, check_simple_selector = function( self, selector ) return M.check_simple_selector( self, selector ) end, foreach = function( self, fn ) fn( self ) for _, child in ipairs(self.children or {}) do child:foreach( fn ) end end } if parent_elem then table.insert( parent_elem.children, o ) end local mt = { __newindex = function(table, key, value) -- Allow modification of existing attributes if rawget(table.attributes, key) ~= nil then rawset(table.attributes, key, value) else -- Prevent adding new attributes error("Cannot add new attribute to DOM element: " .. tostring(key)) end end, __index = function(table, key) -- Allow access to attributes return rawget(table.attributes, key) end } setmetatable(o, mt) return o end function M.preprocess( content ) -- remove "self closing" slashes as they MUST be ignored (spec) -- and would cause problems content = content:gsub("/%s*>", ">") -- remove whitespace at the start of "" tags. content = content:gsub("", "after") goto continue end if content:sub(i, i+1) == "", i) goto continue end --------------------------------- if #text_memory ~= 0 then table.insert( TOKENS, {type="TEXT", value=text_memory} ) text_memory = "" end in_tag = true -- closing tag if content:sub(i, i+1) == "" then if #text_memory ~= 0 then if in_tag and currently_opened_quotes == nil then local word = trim(text_memory) if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then if RAW_TEXT_TAGS[word] then logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.") -- made possible because of the whitespace removal at the start set_skipping_to("" i = content:find(">", i) end end if not word:match("^%s*$") then table.insert( TOKENS, {type="WORD", value=word}) text_memory = "" end goto continue end end if char == "'" or char == '"' then if currently_opened_quotes == char then currently_opened_quotes = nil else currently_opened_quotes = char end text_memory = text_memory .. char goto continue end text_memory = text_memory .. char goto continue else text_memory = text_memory .. char goto continue end ::continue:: i = i+1 end return TOKENS end function M.check_simple_selector(element, selector) -- Skip text nodes if element.tag_name == ":text" then return false end -- Check tag name if specified if selector.tag_name and element.tag_name ~= selector.tag_name then return false end -- Check ID if specified if selector.id and element.attributes.id ~= selector.id then return false end -- Check classes if specified if selector.class and #selector.class > 0 then local element_classes = element.attributes.class if not element_classes then return false end for _, class in ipairs(selector.class) do local found = false for _, elem_class in ipairs(element_classes) do if elem_class == class then found = true break end end if not found then return false end end end return true end function M.query_simple_selector(document, selector) local matches = {} local function traverse(node) if M.check_simple_selector(node, selector) then table.insert(matches, node) end for _, child in ipairs(node.children) do traverse(child) end end traverse(document) return matches end function M.parse_tokens_into_document( TOKENS ) local DOCUMENT = M.make_dom_element(nil, nil) local current_doc_element = DOCUMENT local in_opening_tag_for = nil local i = 1 while i <= #TOKENS do local token = TOKENS[i] if token.type == "WORD" then if current_doc_element.tag_name == ":text" then current_doc_element = current_doc_element.parent end if i > 0 and TOKENS[i-1].type == "START_OPENING_TAG" then local new_elem = M.make_dom_element( token.value, current_doc_element ) current_doc_element = new_elem in_opening_tag_for = token.value goto continue end if i > 0 and TOKENS[i-1].type == "START_CLOSING_TAG" then local curr_elem = current_doc_element while curr_elem.parent and curr_elem.tag_name ~= token.value do curr_elem = curr_elem.parent end if curr_elem.parent == nil then -- reached DOCUMENT root logger.printerr("Warning: reached document root while trying to match for closing " .. token.value .. " token.") current_doc_element = DOCUMENT else current_doc_element = curr_elem.parent end goto continue end if in_opening_tag_for then local pattern = "([%w-]+)=['\"](.-)['\"]" local name, raw_value = token.value:match(pattern) if name == nil or raw_value == nil then name = token.value:match("([%w-]+)") if name == nil then error("Unrecognised word: " .. tostring(name) .. " (Token ".. tostring(i) .." , type=" .. tostring(token.type) .. ", value=" .. tostring(token.value) .. ")") end current_doc_element.attributes[name] = true goto continue end local value = nil if raw_value == "" or raw_value == nil then value = nil else value = trim(raw_value) if name == "class" then local classes = {} for class in value:gmatch("%S+") do table.insert( classes, class ) end value = classes end end current_doc_element.attributes[name] = value goto continue end end if token.type == "END_TAG" then if in_opening_tag_for then if VOID_TAGS[in_opening_tag_for] then if current_doc_element.parent == nil then -- reached DOCUMENT root current_doc_element = DOCUMENT else current_doc_element = current_doc_element.parent end end end in_opening_tag_for = nil goto continue end if token.type == "TEXT" then local new_elem = M.make_dom_element( ":text", current_doc_element ) new_elem.content = token.value current_doc_element = new_elem goto continue end ::continue:: i = i+1 end M.clean_text_nodes( DOCUMENT ) return DOCUMENT end function M.clean_text_nodes(node) if node.tag_name ~= ":text" then -- Don't clean anything in raw text tags if RAW_TEXT_TAGS[node.tag_name] then return end for _, child in ipairs( shallow_copy(node.children) ) do M.clean_text_nodes( child ) end return end -- purge content-less text nodes if #trim(node.content) == 0 then if not node.parent then error("Text node without a parent; should be impossible !") end for i, child in ipairs( shallow_copy(node.parent.children) ) do if child == node then table.remove( node.parent.children, i ) break end end return end node.content = trim( node.content:gsub("%s+", " ") ) end function M._tostring(node, indent, include_internal_pseudoelements) -- Default indentation is 0 (root level) indent = indent or 0 include_internal_pseudoelements = include_internal_pseudoelements or false local is_pseudo_element = (node.tag_name or ":root"):sub(1,1) == ":" local indent_level_str = " " -- Create the indentation string (e.g., " " for each level) local indent_str = string.rep(indent_level_str, indent) if node.tag_name == ":text" then local str = "" if include_internal_pseudoelements then str = str .. "<:text>" end str = str .. node.content if include_internal_pseudoelements then str = str .. "" end return str end local node_name = "" if not is_pseudo_element or include_internal_pseudoelements then -- Print the current node's tag name node_name = node_name .. "\n" .. indent_str .. "<" .. (node.tag_name or ":root") end -- Print attributes if any if next(node.attributes) ~= nil then for attr, value in pairs(node.attributes) do if type(value) == "table" then node_name = node_name .. " " .. attr .. "=\"" for i, val in ipairs( value ) do if i > 1 then node_name = node_name .. " " end node_name = node_name .. tostring(val) end node_name = node_name .. "\"" else node_name = node_name .. " " .. attr .. "=\"" .. tostring(value) .. "\"" end end end if not is_pseudo_element or include_internal_pseudoelements then node_name = node_name .. ">" end local next_indent = indent + 1 if is_pseudo_element and not include_internal_pseudoelements then next_indent = indent end -- Recursively print children for _, child in ipairs(node.children) do node_name = node_name .. M._tostring(child, next_indent, include_internal_pseudoelements) end if not VOID_TAGS[node.tag_name] and ( not is_pseudo_element or include_internal_pseudoelements ) then -- Print the closing tag local end_indent = "" local closing_text_tag = "" if node_name:sub(#node_name, #node_name) == ">" and node_name:sub(#node_name - #closing_text_tag + 1, #node_name) ~= closing_text_tag then end_indent = "\n" .. indent_str end node_name = node_name .. end_indent .. "" end return node_name end function M.tostring(node, base_indent, include_internal_pseudoelements) return trim( M._tostring(node, base_indent, include_internal_pseudoelements) ) end function M.parse( html_string ) local clean_html = M.preprocess( html_string ) local tokens = M.tokenise( clean_html ) local document = M.parse_tokens_into_document( tokens ) return document end return M