feat: add --markdown mode, + minor fixes
parent
59585919fb
commit
5e8c023559
356
html.lua
356
html.lua
|
@ -95,6 +95,9 @@ local INLINE_TAGS = {
|
|||
template = true,
|
||||
}
|
||||
|
||||
--- Collapse every run of whitespace in `str` into a single space.
-- Leading and trailing runs are collapsed too (not removed).
-- @param str string to normalise
-- @return the normalised string, and nothing else
local function normalize_whitespace(str)
    -- string.gsub returns (result, substitution_count); the parentheses keep
    -- only the string so the count cannot leak into a caller's argument list.
    return (str:gsub("%s+", " "))
end
|
||||
|
||||
function M.make_dom_element( tag_name, parent_elem )
|
||||
local o = {
|
||||
|
@ -160,6 +163,90 @@ function M.make_dom_element( tag_name, parent_elem )
|
|||
|
||||
return text
|
||||
end,
|
||||
|
||||
-- Render this element and its descendants as markdown-like text.
--
-- in_pre:    true when an ancestor of this element is a <pre>. It is passed
--            down to descendants; this function itself does not branch on it.
--            Defaults to false.
-- root_call: true only for the outermost call, which is the one that runs the
--            final blank-line cleanup. Defaults to true when omitted, so
--            external callers can simply use `el:inner_markdown()`.
--
-- Returns the rendered string. <script>/<style> elements yield "" and text
-- nodes yield their content unchanged.
inner_markdown = function(self, in_pre, root_call)
    in_pre = in_pre or false
    -- `root_call or true` is always true; only default when it was omitted.
    if root_call == nil then
        root_call = true
    end

    local tag = self.tag_name

    if tag == "script" or tag == "style" then
        return ""
    end

    if tag == ":text" then
        return self.content
    end

    local is_heading = tag:match("^h[1-6]$")
    local is_pre = tag == "pre"

    -- Prefix emitted before the element's own content.
    local text = ""
    if is_heading then
        local level = tonumber(tag:sub(2))
        text = "\n" .. string.rep("#", level) .. " "
    elseif tag == "li" and self.parent then
        local parent_tag = self.parent.tag_name
        if parent_tag == "ul" then
            text = "* "
        elseif parent_tag == "ol" then
            text = self:get_child_index() .. ". "
        end
    end

    -- Render children as nested (non-root) calls, carrying the pre-state down.
    -- Collecting into a table avoids quadratic string concatenation.
    local child_in_pre = in_pre or is_pre
    local parts = {}
    for idx, child in ipairs(self.children) do
        parts[idx] = child:inner_markdown(child_in_pre, false)
    end
    local inner = table.concat(parts)

    if tag == "br" then
        text = text .. "\n" .. inner
    elseif is_pre then
        text = text .. inner
    elseif is_heading then
        text = text .. normalize_whitespace(inner)
    elseif tag == "strong" then
        text = text .. "**" .. normalize_whitespace(inner) .. "**"
    elseif tag == "em" then
        text = text .. "_" .. normalize_whitespace(inner) .. "_"
    elseif tag == "code" then
        -- <code> directly inside <pre> becomes a fenced block, otherwise inline.
        local is_block = self.parent and self.parent.tag_name == "pre"
        if is_block then
            text = text .. "\n```\n" .. inner .. "\n```\n"
        else
            text = text .. "`" .. normalize_whitespace(inner) .. "`"
        end
    elseif tag == "a" then
        text = text .. "[" .. normalize_whitespace(inner) .. "]"

        local href = self.attributes and self.attributes.href
        if href then
            text = text .. "(" .. href .. ")"
        end
    else
        text = text .. inner
    end

    -- Add newlines after block elements
    if not INLINE_TAGS[tag] then
        text = text .. "\n"
    end

    if root_call then
        -- Step 1: Remove whitespace between newlines
        text = text:gsub("(\n)%s+(\n)", "%1%2")
        -- Step 2: Replace 3+ consecutive newlines with just two
        text = text:gsub("\n\n\n+", "\n\n")
    end

    return text
end
|
||||
}
|
||||
|
||||
if parent_elem then
|
||||
|
@ -203,7 +290,7 @@ function M.tokenise( content )
|
|||
local TOKENS = {}
|
||||
|
||||
-- state
|
||||
local in_tag = false
|
||||
local in_tag = nil
|
||||
local currently_opened_quotes = nil
|
||||
local text_memory = ""
|
||||
|
||||
|
@ -213,6 +300,35 @@ function M.tokenise( content )
|
|||
local char = content:sub(i,i)
|
||||
|
||||
|
||||
--
|
||||
-- Taking care of quotes
|
||||
--
|
||||
if in_tag then
|
||||
-- finding matching quotes
|
||||
if currently_opened_quotes ~= nil and char == currently_opened_quotes then
|
||||
currently_opened_quotes = nil
|
||||
text_memory = text_memory .. char
|
||||
goto continue
|
||||
end
|
||||
|
||||
-- Opening a new set of quotes
|
||||
if currently_opened_quotes == nil and (char == "'" or char == '"') then
|
||||
currently_opened_quotes = char
|
||||
text_memory = text_memory .. char
|
||||
goto continue
|
||||
end
|
||||
|
||||
-- reaching here means:
|
||||
-- - we're in a tag, inside quotes
|
||||
-- - the character is not the closing quote mark
|
||||
-- So just add it and get on with it.
|
||||
if currently_opened_quotes ~= nil then
|
||||
text_memory = text_memory .. char
|
||||
goto continue
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
if char == "<" then
|
||||
if content:sub(i, i+3) == "<!--" then
|
||||
|
@ -237,66 +353,220 @@ function M.tokenise( content )
|
|||
text_memory = ""
|
||||
end
|
||||
|
||||
in_tag = true
|
||||
|
||||
-- closing tag
|
||||
if content:sub(i, i+1) == "</" then
|
||||
table.insert( TOKENS, {type="START_CLOSING_TAG"} )
|
||||
in_tag = "closing"
|
||||
i = i+1
|
||||
goto continue
|
||||
end
|
||||
|
||||
table.insert( TOKENS, {type="START_OPENING_TAG"} )
|
||||
in_tag = "opening"
|
||||
goto continue
|
||||
end
|
||||
|
||||
if char == ">" then
|
||||
|
||||
if char == ">" and in_tag then
|
||||
-- first, cleanup the text_memory, as the closing > is often side-by-side with the last "word"
|
||||
if #text_memory ~= 0 then
|
||||
if in_tag and currently_opened_quotes == nil then
|
||||
local word = trim(text_memory)
|
||||
|
||||
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
|
||||
if RAW_TEXT_TAGS[word] then
|
||||
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
-- made possible because of the whitespace removal at the start
|
||||
i = content:find("</"..word, i, true) - 1
|
||||
end
|
||||
end
|
||||
|
||||
if not word:match("^%s*$") then
|
||||
table.insert( TOKENS, {type="WORD", value=word})
|
||||
end
|
||||
else
|
||||
table.insert( TOKENS, {type="TEXT", value=text_memory} )
|
||||
end
|
||||
|
||||
text_memory = ""
|
||||
end
|
||||
|
||||
in_tag = false
|
||||
table.insert( TOKENS, {type = "END_TAG"} )
|
||||
|
||||
-- closing tags don't require any more work.
|
||||
if in_tag == "closing" then
|
||||
in_tag = nil
|
||||
goto continue
|
||||
end
|
||||
in_tag = nil
|
||||
|
||||
local curr_token = #TOKENS
|
||||
while curr_token > 0 and TOKENS[curr_token].type ~= "START_OPENING_TAG" do
|
||||
curr_token = curr_token - 1
|
||||
end
|
||||
curr_token = curr_token + 1
|
||||
|
||||
if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" then
|
||||
error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
|
||||
end
|
||||
|
||||
local tagname = TOKENS[curr_token].value
|
||||
|
||||
if RAW_TEXT_TAGS[tagname] then
|
||||
local end_tag = (content:find("</"..tagname, i, true) or 0) - 1
|
||||
if end_tag < 1 then
|
||||
logger.printerr("Can't find closing " .. tagname .. "!")
|
||||
print(content:sub(i))
|
||||
os.exit(-5)
|
||||
end
|
||||
local text_content = content:sub(i+1, end_tag)
|
||||
|
||||
-- special handling of pre
|
||||
if tagname == "pre" then
|
||||
-- check if it "looks" like HTML
|
||||
if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
|
||||
-- tokenise the inner text
|
||||
local text_tokens = M.tokenise( text_content )
|
||||
|
||||
-- and add it to the current token list
|
||||
for _, tok in ipairs(text_tokens) do
|
||||
table.insert( TOKENS, tok )
|
||||
end
|
||||
|
||||
i = end_tag
|
||||
goto continue
|
||||
end
|
||||
end
|
||||
-- treat the rest as text
|
||||
|
||||
i = end_tag
|
||||
table.insert( TOKENS, {type="TEXT", value=text_content} )
|
||||
goto continue
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
goto continue
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
----------------------------------------------------
|
||||
--- "OLD", UNCHECKED CODE
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
-- if char == ">" and in_tag and currently_opened_quotes == nil then
|
||||
-- if #text_memory ~= 0 then
|
||||
-- local word = trim(text_memory)
|
||||
-- if not word:match("^%s*$") then
|
||||
-- table.insert( TOKENS, {type="WORD", value=word})
|
||||
-- end
|
||||
-- text_memory = ""
|
||||
-- end
|
||||
|
||||
|
||||
-- table.insert( TOKENS, {type = "END_TAG"} )
|
||||
|
||||
|
||||
-- local curr_token = #TOKENS
|
||||
-- while curr_token > 0 and (TOKENS[curr_token].type ~= "START_OPENING_TAG" or TOKENS[curr_token].type ~= "START_CLOSING_TAG") do
|
||||
-- curr_token = curr_token - 1
|
||||
-- end
|
||||
-- curr_token = curr_token + 1
|
||||
-- if curr_token == 1 and TOKENS[curr_token].type ~= "START_OPENING_TAG" and TOKENS[curr_token].type ~= "START_CLOSING_TAG" then
|
||||
-- error("Error: Reached start of token stream while winding back to find tag name; Not supposed to be possible.")
|
||||
-- end
|
||||
|
||||
-- if TOKENS[curr_token].type == "START_CLOSING_TAG" then
|
||||
-- goto continue
|
||||
-- end
|
||||
|
||||
|
||||
-- local tagname = TOKENS[curr_token+1].value
|
||||
|
||||
-- if RAW_TEXT_TAGS[tagname] then
|
||||
-- logger.printerr("Warning: "..tagname.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
|
||||
-- print(content:sub(1,i-1))
|
||||
-- print(("="):rep(40))
|
||||
-- print(content:sub(i))
|
||||
|
||||
-- local end_tag = content:find("</"..tagname, i, true) - 1
|
||||
-- local text_content = content:sub(i+1, end_tag)
|
||||
|
||||
-- if tagname == "pre" and false then
|
||||
-- -- check if it "looks" like HTML
|
||||
-- if text_content:find("<", 1, true) and text_content:find(">", 1, true) then
|
||||
-- -- tokenise the inner text
|
||||
-- local text_tokens = M.tokenise( text_content )
|
||||
|
||||
-- -- and add it to the current token list
|
||||
-- for _, tok in ipairs(text_tokens) do
|
||||
-- if tok.value == nil then
|
||||
-- print( "\t::: " .. tok.type )
|
||||
-- else
|
||||
-- print( "\t::: " .. tok.type .. ": " .. tostring(tok.value) )
|
||||
-- end
|
||||
|
||||
-- table.insert( TOKENS, tok )
|
||||
-- end
|
||||
-- else
|
||||
-- -- treat it as text
|
||||
-- table.insert( TOKENS, {type="TEXT", value=text_content} )
|
||||
-- end
|
||||
|
||||
-- end
|
||||
|
||||
-- i = end_tag
|
||||
-- end
|
||||
|
||||
|
||||
-- in_tag = false
|
||||
-- goto continue
|
||||
-- end
|
||||
|
||||
|
||||
|
||||
-- if #text_memory ~= 0 then
|
||||
-- if in_tag and currently_opened_quotes == nil then
|
||||
-- local word = trim(text_memory)
|
||||
|
||||
-- if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
|
||||
-- if RAW_TEXT_TAGS[word] then
|
||||
-- logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
-- -- made possible because of the whitespace removal at the start
|
||||
-- i = content:find("</"..word, i, true) - 1
|
||||
-- end
|
||||
-- end
|
||||
|
||||
-- if not word:match("^%s*$") then
|
||||
-- table.insert( TOKENS, {type="WORD", value=word})
|
||||
-- end
|
||||
-- else
|
||||
-- table.insert( TOKENS, {type="TEXT", value=text_memory} )
|
||||
-- end
|
||||
|
||||
-- text_memory = ""
|
||||
-- end
|
||||
|
||||
-- in_tag = false
|
||||
-- table.insert( TOKENS, {type = "END_TAG"} )
|
||||
|
||||
-- goto continue
|
||||
-- end
|
||||
|
||||
|
||||
|
||||
if in_tag then
|
||||
if currently_opened_quotes == nil and char:match("%s") then
|
||||
if #text_memory ~= 0 then
|
||||
local word = trim(text_memory)
|
||||
|
||||
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
|
||||
if RAW_TEXT_TAGS[word] then
|
||||
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
text_memory = ""
|
||||
-- if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
|
||||
-- if RAW_TEXT_TAGS[word] then
|
||||
-- logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
-- text_memory = ""
|
||||
|
||||
-- advance to closing ">"
|
||||
i = content:find(">", i, true)
|
||||
-- made possible because of the whitespace removal at the start
|
||||
i = content:find("</"..word, i, true) - 1
|
||||
end
|
||||
end
|
||||
-- -- advance to closing ">"
|
||||
-- i = content:find(">", i, true)
|
||||
-- -- made possible because of the whitespace removal at the start
|
||||
-- i = content:find("</"..word, i, true) - 1
|
||||
-- end
|
||||
-- end
|
||||
|
||||
if not word:match("^%s*$") then
|
||||
table.insert( TOKENS, {type="WORD", value=word})
|
||||
|
@ -307,14 +577,14 @@ function M.tokenise( content )
|
|||
end
|
||||
end
|
||||
|
||||
if char == "'" or char == '"' then
|
||||
-- found matching closing quote type
|
||||
if char == currently_opened_quotes then
|
||||
currently_opened_quotes = nil
|
||||
elseif currently_opened_quotes == nil then
|
||||
currently_opened_quotes = char
|
||||
end
|
||||
end
|
||||
-- if char == "'" or char == '"' then
|
||||
-- -- found matching closing quote type
|
||||
-- if char == currently_opened_quotes then
|
||||
-- currently_opened_quotes = nil
|
||||
-- elseif currently_opened_quotes == nil then
|
||||
-- currently_opened_quotes = char
|
||||
-- end
|
||||
-- end
|
||||
|
||||
text_memory = text_memory .. char
|
||||
goto continue
|
||||
|
@ -431,6 +701,16 @@ function M.parse_tokens_into_document( TOKENS )
|
|||
if i > 0 and TOKENS[i-1].type == "START_CLOSING_TAG" then
|
||||
local curr_elem = current_doc_element
|
||||
|
||||
-- If we find a closing tag, check if:
|
||||
-- - That tag is a void tag (childless, auto-closing)
|
||||
-- - The last child added to the current element is that tag
|
||||
--
|
||||
-- This avoids having <img> tags as parents of <p> tags for example
|
||||
local last_child = curr_elem.children[#curr_elem.children]
|
||||
if last_child and VOID_TAGS[last_child.tag_name] and last_child.tag_name == token.value then
|
||||
goto continue
|
||||
end
|
||||
|
||||
while curr_elem.parent and curr_elem.tag_name ~= token.value do
|
||||
curr_elem = curr_elem.parent
|
||||
end
|
||||
|
@ -450,7 +730,7 @@ function M.parse_tokens_into_document( TOKENS )
|
|||
|
||||
|
||||
if in_opening_tag_for then
|
||||
local pattern = "([%w-]+)=['\"](.+)['\"]"
|
||||
local pattern = "([^=]+)=['\"](.+)['\"]"
|
||||
|
||||
local name, raw_value = token.value:match(pattern)
|
||||
|
||||
|
|
24
main.lua
24
main.lua
|
@ -24,6 +24,9 @@ local CSS = require("css")
|
|||
local logger = require("logging")
|
||||
|
||||
|
||||
--- Strip leading and trailing whitespace from `str`.
-- @param str string to trim
-- @return the text between the outermost non-whitespace characters
--         ("" when `str` is empty or whitespace only)
local function trim(str)
    -- The lazy capture stops as early as possible, leaving trailing
    -- whitespace to be consumed by the final %s*.
    local core = str:match("^%s*(.-)%s*$")
    return core
end
|
||||
|
||||
|
||||
local function file_exists(name)
|
||||
|
@ -42,6 +45,7 @@ local function print_usage()
|
|||
logger.print(" -1, --first-only: return only the first match")
|
||||
logger.print(" -e, --errors: print warnings")
|
||||
logger.print(" -t, --text: Print only the innerText of the matched elements")
|
||||
logger.print(" -m, --markdown: Print only the innerText of the matched elements, but in a markdown-like syntax")
|
||||
logger.print(" -a, --select-attribute: Print the value of the attribute on matched elements. Supersedes -t.")
|
||||
end
|
||||
|
||||
|
@ -60,6 +64,7 @@ local FLAGS = {
|
|||
FIRST_ONLY = {},
|
||||
DO_PRINT_ERRORS = {},
|
||||
INNER_TEXT = {},
|
||||
INNER_MARKDOWN = {},
|
||||
SELECT_ATTRIBUTE = {}
|
||||
}
|
||||
|
||||
|
@ -67,6 +72,7 @@ local LONGHAND_FLAGS = {
|
|||
["first-only"] = FLAGS.FIRST_ONLY,
|
||||
["errors"] = FLAGS.DO_PRINT_ERRORS,
|
||||
["text"] = FLAGS.INNER_TEXT,
|
||||
["markdown"] = FLAGS.INNER_MARKDOWN,
|
||||
["select-attribute"] = FLAGS.SELECT_ATTRIBUTE,
|
||||
}
|
||||
|
||||
|
@ -74,6 +80,7 @@ local SHORTHAND_FLAGS = {
|
|||
["1"] = FLAGS.FIRST_ONLY,
|
||||
["e"] = FLAGS.DO_PRINT_ERRORS,
|
||||
["t"] = FLAGS.INNER_TEXT,
|
||||
["m"] = FLAGS.INNER_MARKDOWN,
|
||||
["a"] = FLAGS.SELECT_ATTRIBUTE,
|
||||
}
|
||||
|
||||
|
@ -353,4 +360,21 @@ if attr then
|
|||
os.exit( RETURN_CODES.OK )
|
||||
end
|
||||
|
||||
|
||||
|
||||
-- Print the matched elements one by one, up to
-- MAX_NUMBER_OF_ELEMENTS_TO_SHOW, in the format selected by the flags:
-- markdown takes precedence over plain text, and raw HTML is the fallback.
local index = 1
while index <= MAX_NUMBER_OF_ELEMENTS_TO_SHOW do
    local element = elements[index]
    index = index + 1

    if flags[FLAGS.INNER_MARKDOWN] then
        -- markdown output is trimmed before printing
        logger.print( trim(element:inner_markdown()) )
    elseif flags[FLAGS.INNER_TEXT] then
        logger.print( element:inner_text() )
    else
        logger.print( HTML.tostring(element) )
    end
end

os.exit( RETURN_CODES.OK )
|
||||
|
|
Loading…
Reference in New Issue