feat: add --markdown mode, + minor fixes

2025-02-05 13:50:15 +01:00 · 2025-02-05 13:50:15 +01:00 · 5e8c023559
parent 59585919fb
commit 5e8c023559
2 changed files with 345 additions and 41 deletions
--- a/html.lua
+++ b/html.lua
@ -95,6 +95,9 @@ local INLINE_TAGS = {
 	template = true,
 }

+local function normalize_whitespace(str) -- collapse each whitespace run into one space
+	return (str:gsub("%s+", " ")) -- parentheses drop gsub's second result (the match count)
+end

 function M.make_dom_element( tag_name, parent_elem )
 	local o = {
@ -160,6 +163,90 @@ function M.make_dom_element( tag_name, parent_elem )

 			return text
 		end,
+
+		-- Render this element and its descendants as markdown-like text.
+		-- in_pre: true when an ancestor is a <pre>; it is only forwarded to children.
+		-- root_call: false on recursive calls; nil or true runs the final cleanup.
+		inner_markdown = function(self, in_pre, root_call)
+			in_pre = in_pre or false
+			if root_call == nil then root_call = true end -- "x or true" would also override false
+
+			if self.tag_name == "script" or self.tag_name == "style" then
+				return ""
+			end
+
+			if self.tag_name == ":text" then
+				return self.content
+			end
+
+			local text = ""
+			local is_list_item = self.tag_name == "li"
+			local parent_is_ul = self.parent and self.parent.tag_name == "ul"
+			local parent_is_ol = self.parent and self.parent.tag_name == "ol"
+
+			local is_heading = self.tag_name:match("^h[1-6]$")
+			local is_pre = self.tag_name == "pre"
+
+			if is_heading then
+				local level = tonumber(self.tag_name:sub(2))
+				text = "\n" .. string.rep("#", level) .. " "
+			end
+
+			if is_list_item then
+				if parent_is_ul then
+					text = "* "
+				elseif parent_is_ol then
+					local position = self:get_child_index()
+					text = position .. ". "
+				end
+			end
+
+			-- Process children: forward the <pre> context; recursive calls are never root.
+			local parts = {}
+			for _, child in ipairs(self.children) do
+				parts[#parts + 1] = child:inner_markdown(in_pre or is_pre, false)
+			end
+			local inner = table.concat(parts)
+
+			if self.tag_name == "br" then
+				text = text .. "\n" .. inner
+			elseif is_pre then
+				text = text .. inner
+			elseif is_heading then
+				text = text .. normalize_whitespace(inner)
+			elseif self.tag_name == "strong" then
+				text = text .. "**" .. normalize_whitespace(inner) .. "**"
+			elseif self.tag_name == "em" then
+				text = text .. "_" .. normalize_whitespace(inner) .. "_"
+			elseif self.tag_name == "code" then
+				local is_block = self.parent and self.parent.tag_name == "pre"
+				if is_block then
+					text = text .. "\n```\n" .. inner .. "\n```\n"
+				else
+					text = text .. "`" .. normalize_whitespace(inner) .. "`"
+				end
+			elseif self.tag_name == "a" then
+				text = text .. "[" .. normalize_whitespace(inner) .. "]"
+				if self.attributes.href then
+					text = text .. "(" .. self.attributes.href .. ")"
+				end
+			else
+				text = text .. inner
+			end
+
+			-- Add newlines after block elements
+			if not INLINE_TAGS[self.tag_name] then
+				text = text .. "\n"
+			end
+
+			if root_call then
+				-- Step 1: Remove whitespace between newlines
+				text = text:gsub("(\n)%s+(\n)", "%1%2")
+				-- Step 2: Replace 3+ consecutive newlines with just two
+				text = text:gsub("\n\n\n+", "\n\n")
+			end
+			return text
+		end
 	}

 	if parent_elem then
@ -203,7 +290,7 @@ function M.tokenise( content )
 	local TOKENS = {}

 	-- state
-	local in_tag = false
+	local in_tag = nil
 	local currently_opened_quotes = nil
 	local text_memory = ""

@ -213,6 +300,35 @@ function M.tokenise( content )
 		local char = content:sub(i,i)


+		--
+		-- Taking care of quotes
+		--
+		if in_tag then
+			-- finding matching quotes
+			if currently_opened_quotes ~= nil and char == currently_opened_quotes then
+				currently_opened_quotes = nil
+				text_memory = text_memory .. char
+				goto continue
+			end
+
+			-- Opening a new set of quotes
+			if currently_opened_quotes == nil and (char == "'" or char == '"') then
+				currently_opened_quotes = char
+				text_memory = text_memory .. char
+				goto continue
+			end
+
+			-- reaching here means:
+			-- - we're in a tag, inside quotes
+			-- - the character is not the closing quote mark
+			-- So just add it and get on with it.
+			if currently_opened_quotes ~= nil then
+				text_memory = text_memory .. char
+				goto continue
+			end
+		end
+
+

 		if char == "<" then
 			if content:sub(i, i+3) == "<!--" then
@ -237,66 +353,220 @@ function M.tokenise( content )
 				text_memory = ""
 			end

-			in_tag = true
-
 			-- closing tag
 			if content:sub(i, i+1) == "</" then
 				table.insert( TOKENS, {type="START_CLOSING_TAG"} )
+				in_tag = "closing"
 				i = i+1
 				goto continue
 			end

 			table.insert( TOKENS, {type="START_OPENING_TAG"} )
+			in_tag = "opening"
 			goto continue
 		end

-		if char == ">" then
+
+		if char == ">" and in_tag then
+			-- first, cleanup the text_memory, as the closing > is often side-by-side with the last "word"
 			if #text_memory ~= 0 then
-				if in_tag and currently_opened_quotes == nil then
 				local word = trim(text_memory)
-
-					if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
-						if RAW_TEXT_TAGS[word] then
-							logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
-							-- made possible because of the whitespace removal at the start
-							i = content:find("</"..word, i, true) - 1
-						end
-					end
-
 				if not word:match("^%s*$") then
 					table.insert( TOKENS, {type="WORD", value=word})
 				end
-				else
-					table.insert( TOKENS, {type="TEXT", value=text_memory} )
-				end
-
 				text_memory = ""
 			end

-			in_tag = false
 			table.insert( TOKENS, {type = "END_TAG"} )

+			-- closing tags don't require any more work.
+			if in_tag == "closing" then
+				in_tag = nil
+				goto continue
+			end
+			in_tag = nil
+
+			local curr_token = #TOKENS
+			while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do
+				curr_token = curr_token - 1
+			end
+			curr_token = curr_token + 1
+
+			if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" then
+				error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
+			end
+
+			local tagname = TOKENS[curr_token].value
+
+			if RAW_TEXT_TAGS[tagname] then
+				local end_tag = (content:find("</"..tagname, i, true) or 0) - 1
+				if end_tag < 1 then
+					logger.printerr("Can't find closing " .. tagname .. "!")
+					print(content:sub(i))
+					os.exit(-5)
+				end
+				local text_content = content:sub(i+1, end_tag)
+
+				-- special handling of pre
+				if tagname == "pre" then
+					-- check if it "looks" like HTML
+					if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
+						-- tokenise the inner text
+						local text_tokens = M.tokenise( text_content )
+
+						-- and add it to the current token list
+						for _, tok in ipairs(text_tokens) do
+							table.insert( TOKENS, tok )
+						end
+
+						i = end_tag
+						goto continue
+					end
+				end
+				-- treat the rest as text
+
+				i = end_tag
+				table.insert( TOKENS, {type="TEXT", value=text_content} )
+				goto continue
+			end
+
+
+
+
 			goto continue
 		end



+
+
+
+		----------------------------------------------------
+		---  "OLD", UNCHECKED CODE
+
+
+
+
+
+
+		-- if char == ">" and in_tag and currently_opened_quotes == nil then
+		-- 	if #text_memory ~= 0 then
+		-- 		local word = trim(text_memory)
+		-- 		if not word:match("^%s*$") then
+		-- 			table.insert( TOKENS, {type="WORD", value=word})
+		-- 		end
+		-- 		text_memory = ""
+		-- 	end
+
+
+		-- 	table.insert( TOKENS, {type = "END_TAG"} )
+
+
+		-- 	local curr_token = #TOKENS
+		-- 	while curr_token > 0 and (TOKENS[curr_token].type ~= "START_OPENING_TAG" or TOKENS[curr_token].type ~= "START_CLOSING_TAG") do
+		-- 		curr_token = curr_token - 1
+		-- 	end
+		-- 	curr_token = curr_token + 1
+		-- 	if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" and TOKENS[curr_token].type ~= "START_CLOSING_TAG" then
+		-- 		error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
+		-- 	end
+
+		-- 	if TOKENS[curr_token].type == "START_CLOSING_TAG" then
+		-- 		goto continue
+		-- 	end
+
+
+		-- 	local tagname = TOKENS[curr_token+1].value
+
+		-- 	if RAW_TEXT_TAGS[tagname] then
+		-- 		logger.printerr("Warning: "..tagname.." tags may contain text that would be incorrectly parsed as HTML.")
+
+		-- 		print(content:sub(1,i-1))
+		-- 		print(("="):rep(40))
+		-- 		print(content:sub(i))
+
+		-- 		local end_tag = content:find("</"..tagname, i, true) - 1
+		-- 		local text_content = content:sub(i+1, end_tag)
+
+		-- 		if tagname == "pre" and false then
+		-- 			-- check if it "looks" like HTML
+		-- 			if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
+		-- 				-- tokenise the inner text
+		-- 				local text_tokens = M.tokenise( text_content )
+
+		-- 				-- and add it to the current token list
+		-- 				for _, tok in ipairs(text_tokens) do
+		-- 					if tok.value == nil then
+		-- 						print( "\t::: " .. tok.type )
+		-- 					else
+		-- 						print( "\t::: " .. tok.type .. ": " .. tostring(tok.value) )
+		-- 					end
+
+		-- 					table.insert( TOKENS, tok )
+		-- 				end
+		-- 			else
+		-- 				-- treat it as text
+		-- 				table.insert( TOKENS, {type="TEXT", value=text_content} )
+		-- 			end
+
+		-- 		end
+
+		-- 		i = end_tag
+		-- 	end
+
+
+		-- 	in_tag = false
+		-- 	goto continue
+		-- end
+
+
+
+		-- if #text_memory ~= 0 then
+		--	if in_tag and currently_opened_quotes == nil then
+		--		local word = trim(text_memory)
+
+		--		if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
+		--			if RAW_TEXT_TAGS[word] then
+		--				logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
+		--				-- made possible because of the whitespace removal at the start
+		--				i = content:find("</"..word, i, true) - 1
+		--			end
+		--		end
+
+		--		if not word:match("^%s*$") then
+		--			table.insert( TOKENS, {type="WORD", value=word})
+		--		end
+		--	else
+		--		table.insert( TOKENS, {type="TEXT", value=text_memory} )
+		--	end
+
+		--	text_memory = ""
+		-- end
+
+		-- in_tag = false
+		-- table.insert( TOKENS, {type = "END_TAG"} )
+
+		-- goto continue
+		-- end
+
+
+
 		if in_tag then
 			if currently_opened_quotes == nil and char:match("%s") then
 				if #text_memory ~= 0 then
 					local word = trim(text_memory)

-					if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
-						if RAW_TEXT_TAGS[word] then
-							logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
-							text_memory = ""
+					-- if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
+					-- 	if RAW_TEXT_TAGS[word] then
+					-- 		logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
+					-- 		text_memory = ""

-							-- advance to closing ">"
-							i = content:find(">", i, true)
-							-- made possible because of the whitespace removal at the start
-							i = content:find("</"..word, i, true) - 1
-						end
-					end
+					-- 		-- advance to closing ">"
+					-- 		i = content:find(">", i, true)
+					-- 		-- made possible because of the whitespace removal at the start
+					-- 		i = content:find("</"..word, i, true) - 1
+					-- 	end
+					-- end

 					if not word:match("^%s*$") then
 						table.insert( TOKENS, {type="WORD", value=word})
@ -307,14 +577,14 @@ function M.tokenise( content )
 				end
 			end

-			if char == "'" or char == '"' then
-				-- found matching closing quote type
-				if char == currently_opened_quotes then
-					currently_opened_quotes = nil
-				elseif currently_opened_quotes == nil then
-					currently_opened_quotes = char
-				end
-			end
+			-- if char == "'" or char == '"' then
+			--	-- found matching closing quote type
+			--	if char == currently_opened_quotes then
+			--		currently_opened_quotes = nil
+			--	elseif currently_opened_quotes == nil then
+			--		currently_opened_quotes = char
+			--	end
+			-- end

 			text_memory = text_memory .. char
 			goto continue
@ -431,6 +701,16 @@ function M.parse_tokens_into_document( TOKENS )
 			if i > 0 and TOKENS[i-1].type == "START_CLOSING_TAG" then
 				local curr_elem = current_doc_element

+				-- If we find a closing tag, check if:
+				-- - That tag is a void tag (childless, auto-closing)
+				-- - The last child added to the current element is that tag
+				--
+				-- This avoids having <img> tags as parents of <p> tags for example
+				local last_child = curr_elem.children[#curr_elem.children]
+				if last_child and VOID_TAGS[last_child.tag_name] and last_child.tag_name == token.value then
+					goto continue
+				end
+
 				while curr_elem.parent and curr_elem.tag_name ~= token.value do
 					curr_elem = curr_elem.parent
 				end
@ -450,7 +730,7 @@ function M.parse_tokens_into_document( TOKENS )


 			if in_opening_tag_for then
-				local pattern = "([%w-]+)=['\"](.+)['\"]"
+				local pattern = "([^=]+)=['\"](.+)['\"]"

 				local name, raw_value = token.value:match(pattern)

--- a/main.lua
+++ b/main.lua
@ -24,6 +24,9 @@ local CSS = require("css")
 local logger = require("logging")


+local function trim(str) -- strip leading and trailing whitespace from str
+	return (str:match("^%s*(.-)%s*$"))
+end


 local function file_exists(name)
@ -42,6 +45,7 @@ local function print_usage()
 	logger.print("  -1, --first-only: return only the first match")
 	logger.print("  -e, --errors: print warnings")
 	logger.print("  -t, --text: Print only the innerText of the matched elements")
+	logger.print("  -m, --markdown: Print only the innerText of the matched elements, but in a markdown-like syntax")
 	logger.print("  -a, --select-attribute: Print the value of the attribute on matched elements. Supersedes -t.")
 end

@ -60,6 +64,7 @@ local FLAGS = {
 	FIRST_ONLY = {},
 	DO_PRINT_ERRORS = {},
 	INNER_TEXT = {},
+	INNER_MARKDOWN = {},
 	SELECT_ATTRIBUTE = {}
 }

@ -67,6 +72,7 @@ local LONGHAND_FLAGS = {
 	["first-only"] = FLAGS.FIRST_ONLY,
 	["errors"] = FLAGS.DO_PRINT_ERRORS,
 	["text"] = FLAGS.INNER_TEXT,
+	["markdown"] = FLAGS.INNER_MARKDOWN,
 	["select-attribute"] = FLAGS.SELECT_ATTRIBUTE,
 }

@ -74,6 +80,7 @@ local SHORTHAND_FLAGS = {
 	["1"] = FLAGS.FIRST_ONLY,
 	["e"] = FLAGS.DO_PRINT_ERRORS,
 	["t"] = FLAGS.INNER_TEXT,
+	["m"] = FLAGS.INNER_MARKDOWN,
 	["a"] = FLAGS.SELECT_ATTRIBUTE,
 }

@ -353,4 +360,21 @@ if attr then
 	os.exit( RETURN_CODES.OK )
 end

+
+
+local i = 1 -- 1-based index into `elements`
+while i <= MAX_NUMBER_OF_ELEMENTS_TO_SHOW do -- NOTE(review): presumably capped at #elements elsewhere, otherwise `el` is nil below — confirm
+	local el = elements[i]
+
+	if flags[FLAGS.INNER_MARKDOWN] then -- checked first, so markdown wins when both flags are set
+		logger.print( trim(el:inner_markdown()) )
+	elseif flags[FLAGS.INNER_TEXT] then
+		logger.print( el:inner_text() )
+	else -- neither flag set: print the element through HTML.tostring
+		logger.print( HTML.tostring(el) )
 	end
+
+	i = i+1
+end
+
+os.exit( RETURN_CODES.OK )