Compare commits
5 Commits
a88d21165c
...
79cbc61bb0
Author | SHA1 | Date |
---|---|---|
|
79cbc61bb0 | |
|
4af0f68fa9 | |
|
40c4b464dc | |
|
554daec953 | |
|
f25ecbe0cc |
|
@ -34,9 +34,7 @@ Supported combinators are all the "basic" ones:
|
||||||
|
|
||||||
### TODO
|
### TODO
|
||||||
|
|
||||||
- [ ] `--text` option to only get the text in the matched elements
|
|
||||||
- [ ] Universal selector (`*` to match any element)
|
- [ ] Universal selector (`*` to match any element)
|
||||||
- [ ] Attribute selectors (`[attr="value"]`)
|
|
||||||
- [ ] A way to "group" selectors, e.g. `aside {p, footer}` to select all `p`s and `footer`s in `aside`s ?
|
- [ ] A way to "group" selectors, e.g. `aside {p, footer}` to select all `p`s and `footer`s in `aside`s ?
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
@ -57,6 +55,8 @@ Where:
|
||||||
* `-1`, `--first-only`: Return only the first match
|
* `-1`, `--first-only`: Return only the first match
|
||||||
* `-e`, `--errors`: print warnings
|
* `-e`, `--errors`: print warnings
|
||||||
* `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/fr/docs/Web/API/HTMLElement/innerText) of the matched elements
|
* `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/fr/docs/Web/API/HTMLElement/innerText) of the matched elements
|
||||||
|
* `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/fr/
|
||||||
|
* `-a`, `--select-attribute`: Print the value of the attribute on matched elements. Supersedes -t.
|
||||||
|
|
||||||
## Motivation
|
## Motivation
|
||||||
|
|
||||||
|
|
86
html.lua
86
html.lua
|
@ -159,7 +159,7 @@ function M.make_dom_element( tag_name, parent_elem )
|
||||||
end
|
end
|
||||||
|
|
||||||
return text
|
return text
|
||||||
end
|
end,
|
||||||
}
|
}
|
||||||
|
|
||||||
if parent_elem then
|
if parent_elem then
|
||||||
|
@ -207,76 +207,27 @@ function M.tokenise( content )
|
||||||
local currently_opened_quotes = nil
|
local currently_opened_quotes = nil
|
||||||
local text_memory = ""
|
local text_memory = ""
|
||||||
|
|
||||||
local skipping_from = nil
|
|
||||||
local skip_target = nil
|
|
||||||
local skip_mode = "before"
|
|
||||||
|
|
||||||
local function set_skipping_to( str, mode )
|
|
||||||
mode = mode or "before"
|
|
||||||
if mode ~= "before" and mode ~= "after" then
|
|
||||||
error("Unexpected skipping mode: " .. mode .. ", in looking for " .. str)
|
|
||||||
end
|
|
||||||
|
|
||||||
skip_target = str
|
|
||||||
skip_mode = mode
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
local i = 1
|
local i = 1
|
||||||
|
|
||||||
while i <= #content do
|
while i <= #content do
|
||||||
local char = content:sub(i,i)
|
local char = content:sub(i,i)
|
||||||
|
|
||||||
if skip_target ~= nil then
|
|
||||||
if skipping_from == nil then
|
|
||||||
skipping_from = i
|
|
||||||
end
|
|
||||||
|
|
||||||
if skip_mode == "before" then
|
|
||||||
local end_i = i + #skip_target - 1
|
|
||||||
|
|
||||||
if trim(content:sub(i, end_i)) == skip_target then
|
|
||||||
table.insert( TOKENS, {type="TEXT", value=content:sub(skipping_from, i-1)} )
|
|
||||||
|
|
||||||
-- release from skip
|
|
||||||
--i = end_i - 1
|
|
||||||
i = i - 1
|
|
||||||
skip_target = nil
|
|
||||||
skipping_from = nil
|
|
||||||
end
|
|
||||||
|
|
||||||
goto continue
|
|
||||||
else
|
|
||||||
local start_i = i - #skip_target + 1
|
|
||||||
|
|
||||||
if trim(content:sub(start_i, i)) == skip_target then
|
|
||||||
table.insert( TOKENS, {type="TEXT", value=content:sub(skipping_from, start_i-1)} )
|
|
||||||
|
|
||||||
-- release from skip
|
|
||||||
i = start_i
|
|
||||||
skip_target = nil
|
|
||||||
skipping_from = nil
|
|
||||||
end
|
|
||||||
|
|
||||||
goto continue
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if char == "<" then
|
if char == "<" then
|
||||||
if content:sub(i, i+3) == "<!--" then
|
if content:sub(i, i+3) == "<!--" then
|
||||||
set_skipping_to("-->", "after")
|
local end_i = content:find("-->", i+3, true)
|
||||||
|
if end_i then
|
||||||
|
i = end_i + 2
|
||||||
|
else
|
||||||
|
i = #content
|
||||||
|
end
|
||||||
|
|
||||||
goto continue
|
goto continue
|
||||||
end
|
end
|
||||||
|
|
||||||
if content:sub(i, i+1) == "<!" then
|
if content:sub(i, i+1) == "<!" then
|
||||||
i = content:find(">", i)
|
i = content:find(">", i, true)
|
||||||
goto continue
|
goto continue
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -308,7 +259,7 @@ function M.tokenise( content )
|
||||||
if RAW_TEXT_TAGS[word] then
|
if RAW_TEXT_TAGS[word] then
|
||||||
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||||
-- made possible because of the whitespace removal at the start
|
-- made possible because of the whitespace removal at the start
|
||||||
set_skipping_to("</" .. word)
|
i = content:find("</"..word, i, true) - 1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -338,12 +289,12 @@ function M.tokenise( content )
|
||||||
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
|
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
|
||||||
if RAW_TEXT_TAGS[word] then
|
if RAW_TEXT_TAGS[word] then
|
||||||
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||||
-- made possible because of the whitespace removal at the start
|
|
||||||
set_skipping_to("</" .. word)
|
|
||||||
text_memory = ""
|
text_memory = ""
|
||||||
|
|
||||||
-- advance to closing ">"
|
-- advance to closing ">"
|
||||||
i = content:find(">", i)
|
i = content:find(">", i, true)
|
||||||
|
-- made possible because of the whitespace removal at the start
|
||||||
|
i = content:find("</"..word, i, true) - 1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -356,16 +307,13 @@ function M.tokenise( content )
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
if char == "'" or char == '"' then
|
if char == "'" or char == '"' then
|
||||||
if currently_opened_quotes == char then
|
-- found matching closing quote type
|
||||||
|
if char == currently_opened_quotes then
|
||||||
currently_opened_quotes = nil
|
currently_opened_quotes = nil
|
||||||
else
|
elseif currently_opened_quotes == nil then
|
||||||
currently_opened_quotes = char
|
currently_opened_quotes = char
|
||||||
end
|
end
|
||||||
|
|
||||||
text_memory = text_memory .. char
|
|
||||||
goto continue
|
|
||||||
end
|
end
|
||||||
|
|
||||||
text_memory = text_memory .. char
|
text_memory = text_memory .. char
|
||||||
|
@ -502,7 +450,7 @@ function M.parse_tokens_into_document( TOKENS )
|
||||||
|
|
||||||
|
|
||||||
if in_opening_tag_for then
|
if in_opening_tag_for then
|
||||||
local pattern = "([%w-]+)=['\"](.-)['\"]"
|
local pattern = "([%w-]+)=['\"](.+)['\"]"
|
||||||
|
|
||||||
local name, raw_value = token.value:match(pattern)
|
local name, raw_value = token.value:match(pattern)
|
||||||
|
|
||||||
|
|
3
htmlq.1
3
htmlq.1
|
@ -17,6 +17,9 @@ Print warning messages
|
||||||
.TP
|
.TP
|
||||||
.BR \-t ", " \-\-text
|
.BR \-t ", " \-\-text
|
||||||
Print only the innerText of matched elements
|
Print only the innerText of matched elements
|
||||||
|
.TP
|
||||||
|
.BR \-a ", " \-\-select\-attribute
|
||||||
|
Print the value of the attribute on matched elements. Supersedes -t.
|
||||||
.SH ARGUMENTS
|
.SH ARGUMENTS
|
||||||
.TP
|
.TP
|
||||||
.I html_path_or_minus
|
.I html_path_or_minus
|
||||||
|
|
165
main.lua
165
main.lua
|
@ -42,6 +42,7 @@ local function print_usage()
|
||||||
logger.print(" -1, --first-only: return only the first match")
|
logger.print(" -1, --first-only: return only the first match")
|
||||||
logger.print(" -e, --errors: print warnings")
|
logger.print(" -e, --errors: print warnings")
|
||||||
logger.print(" -t, --text: Print only the innerText of the matched elements")
|
logger.print(" -t, --text: Print only the innerText of the matched elements")
|
||||||
|
logger.print(" -a, --select-attribute: Print the value of the attribute on matched elements. Supersedes -t.")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -59,18 +60,26 @@ local FLAGS = {
|
||||||
FIRST_ONLY = {},
|
FIRST_ONLY = {},
|
||||||
DO_PRINT_ERRORS = {},
|
DO_PRINT_ERRORS = {},
|
||||||
INNER_TEXT = {},
|
INNER_TEXT = {},
|
||||||
|
SELECT_ATTRIBUTE = {}
|
||||||
}
|
}
|
||||||
|
|
||||||
local LONGHAND_FLAGS = {
|
local LONGHAND_FLAGS = {
|
||||||
["first-only"] = FLAGS.FIRST_ONLY,
|
["first-only"] = FLAGS.FIRST_ONLY,
|
||||||
["errors"] = FLAGS.DO_PRINT_ERRORS,
|
["errors"] = FLAGS.DO_PRINT_ERRORS,
|
||||||
["text"] = FLAGS.INNER_TEXT,
|
["text"] = FLAGS.INNER_TEXT,
|
||||||
|
["select-attribute"] = FLAGS.SELECT_ATTRIBUTE,
|
||||||
}
|
}
|
||||||
|
|
||||||
local SHORTHAND_FLAGS = {
|
local SHORTHAND_FLAGS = {
|
||||||
["1"] = FLAGS.FIRST_ONLY,
|
["1"] = FLAGS.FIRST_ONLY,
|
||||||
["e"] = FLAGS.DO_PRINT_ERRORS,
|
["e"] = FLAGS.DO_PRINT_ERRORS,
|
||||||
["t"] = FLAGS.INNER_TEXT,
|
["t"] = FLAGS.INNER_TEXT,
|
||||||
|
["a"] = FLAGS.SELECT_ATTRIBUTE,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
local FLAG_NEEDS_VALUE = {
|
||||||
|
[FLAGS.SELECT_ATTRIBUTE] = true,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,44 +93,103 @@ end
|
||||||
local flags = {}
|
local flags = {}
|
||||||
local positionals = {}
|
local positionals = {}
|
||||||
|
|
||||||
for _, argument in ipairs(arg) do
|
local i = 1
|
||||||
if argument:match("^%-%w+$") then
|
while i <= #arg do
|
||||||
for letter in argument:sub(2):gmatch("(%w)") do
|
local argument = arg[i]
|
||||||
if not SHORTHAND_FLAGS[letter] then
|
|
||||||
logger.printerr("Unknown flag: -"..letter..".")
|
|
||||||
print_usage()
|
|
||||||
os.exit( RETURN_CODES.ARGUMENTS_ERROR )
|
|
||||||
end
|
|
||||||
|
|
||||||
|
-- Handle shorthand flags (-a, -1, etc.)
|
||||||
|
if argument:match("^%-%w+$") then
|
||||||
|
local flag_str = argument:sub(2)
|
||||||
|
|
||||||
|
-- Handle single-letter flags
|
||||||
|
if #flag_str == 1 then
|
||||||
|
local letter = flag_str
|
||||||
local flag = SHORTHAND_FLAGS[letter]
|
local flag = SHORTHAND_FLAGS[letter]
|
||||||
|
|
||||||
if flags[flag] then
|
if not flag then
|
||||||
logger.printerr("Warning: passed -" .. letter .. " flag already !")
|
logger.printerr("Unknown flag: -"..letter)
|
||||||
|
print_usage()
|
||||||
|
os.exit(RETURN_CODES.ARGUMENTS_ERROR)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
-- Handle flags that require values
|
||||||
|
if FLAG_NEEDS_VALUE[flag] then
|
||||||
|
if i == #arg then
|
||||||
|
logger.printerr("Flag -"..letter.." requires a value")
|
||||||
|
os.exit(RETURN_CODES.ARGUMENTS_ERROR)
|
||||||
|
end
|
||||||
|
flags[flag] = arg[i+1]
|
||||||
|
i = i + 2 -- Skip next argument as it's the value
|
||||||
|
else
|
||||||
|
-- Handle regular boolean flags
|
||||||
|
if flags[flag] then
|
||||||
|
logger.printerr("Warning: passed -"..letter.." flag already!")
|
||||||
|
end
|
||||||
|
flags[flag] = true
|
||||||
|
i = i + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
else
|
||||||
|
-- Handle grouped flags (-abc)
|
||||||
|
for letter in flag_str:gmatch("(%w)") do
|
||||||
|
local flag = SHORTHAND_FLAGS[letter]
|
||||||
|
|
||||||
|
if not flag then
|
||||||
|
logger.printerr("Unknown flag in group: -"..letter)
|
||||||
|
print_usage()
|
||||||
|
os.exit(RETURN_CODES.ARGUMENTS_ERROR)
|
||||||
|
end
|
||||||
|
|
||||||
|
if FLAG_NEEDS_VALUE[flag] then
|
||||||
|
logger.printerr("Cannot use value-taking flags in groups: -"..letter)
|
||||||
|
os.exit(RETURN_CODES.ARGUMENTS_ERROR)
|
||||||
|
end
|
||||||
|
|
||||||
|
if flags[flag] then
|
||||||
|
logger.printerr("Warning: passed -"..letter.." flag already!")
|
||||||
|
end
|
||||||
flags[flag] = true
|
flags[flag] = true
|
||||||
end
|
end
|
||||||
elseif argument:match("^%-%-[%w%-]+$") then
|
i = i + 1
|
||||||
local flagname = argument:sub(3)
|
|
||||||
if not LONGHAND_FLAGS[flagname] then
|
|
||||||
logger.printerr("Unknown flag: --"..flagname..".")
|
|
||||||
print_usage()
|
|
||||||
os.exit( RETURN_CODES.ARGUMENTS_ERROR )
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
-- Handle long flags (--flag)
|
||||||
|
elseif argument:match("^%-%-") then
|
||||||
|
local flagname = argument:sub(3)
|
||||||
local flag = LONGHAND_FLAGS[flagname]
|
local flag = LONGHAND_FLAGS[flagname]
|
||||||
|
|
||||||
if flags[flag] then
|
if not flag then
|
||||||
logger.printerr("Warning: passed --" .. flagname .. " flag already !")
|
logger.printerr("Unknown flag: --"..flagname)
|
||||||
|
print_usage()
|
||||||
|
os.exit(RETURN_CODES.ARGUMENTS_ERROR)
|
||||||
end
|
end
|
||||||
|
|
||||||
flags[flag] = true
|
-- Handle flags that require values
|
||||||
|
if FLAG_NEEDS_VALUE[flag] then
|
||||||
|
if i == #arg then
|
||||||
|
logger.printerr("Flag --"..flagname.." requires a value")
|
||||||
|
os.exit(RETURN_CODES.ARGUMENTS_ERROR)
|
||||||
|
end
|
||||||
|
flags[flag] = arg[i+1]
|
||||||
|
i = i + 2 -- Skip next argument as it's the value
|
||||||
else
|
else
|
||||||
table.insert( positionals, argument )
|
-- Handle regular boolean flags
|
||||||
|
if flags[flag] then
|
||||||
|
logger.printerr("Warning: passed --"..flagname.." flag already!")
|
||||||
|
end
|
||||||
|
flags[flag] = true
|
||||||
|
i = i + 1
|
||||||
|
end
|
||||||
|
|
||||||
|
else
|
||||||
|
-- Handle positional arguments
|
||||||
|
table.insert(positionals, argument)
|
||||||
|
i = i + 1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if flags[ FLAGS.DO_PRINT_ERRORS ] then
|
if flags[ FLAGS.DO_PRINT_ERRORS ] then
|
||||||
logger.enable_printing_errors()
|
logger.enable_printing_errors()
|
||||||
end
|
end
|
||||||
|
@ -238,24 +306,51 @@ if #elements == 0 then
|
||||||
os.exit( RETURN_CODES.NOTHING_FOUND )
|
os.exit( RETURN_CODES.NOTHING_FOUND )
|
||||||
end
|
end
|
||||||
|
|
||||||
|
local MAX_NUMBER_OF_ELEMENTS_TO_SHOW = #elements
|
||||||
if flags[FLAGS.FIRST_ONLY] then
|
if flags[FLAGS.FIRST_ONLY] then
|
||||||
if #elements > 0 then
|
MAX_NUMBER_OF_ELEMENTS_TO_SHOW = 1
|
||||||
if flags[FLAGS.INNER_TEXT] then
|
|
||||||
logger.print( elements[1]:inner_text() )
|
|
||||||
os.exit( RETURN_CODES.OK )
|
|
||||||
end
|
|
||||||
|
|
||||||
logger.print( HTML.tostring( elements[1] ) )
|
|
||||||
end
|
|
||||||
|
|
||||||
os.exit( RETURN_CODES.OK )
|
|
||||||
end
|
end
|
||||||
|
|
||||||
for _, el in ipairs(elements) do
|
|
||||||
if flags[FLAGS.INNER_TEXT] then
|
|
||||||
logger.print( el:inner_text() )
|
|
||||||
|
|
||||||
|
local attr = flags[FLAGS.SELECT_ATTRIBUTE]
|
||||||
|
if attr then
|
||||||
|
local spoof_nil = {}
|
||||||
|
local attrs = {}
|
||||||
|
|
||||||
|
local i = 1
|
||||||
|
while i <= MAX_NUMBER_OF_ELEMENTS_TO_SHOW do
|
||||||
|
local el = elements[i]
|
||||||
|
|
||||||
|
local attribute_value = el.attributes[attr]
|
||||||
|
|
||||||
|
table.insert( attrs, attribute_value or spoof_nil )
|
||||||
|
|
||||||
|
i = i+1
|
||||||
|
end
|
||||||
|
|
||||||
|
local nb_non_nil_values = 0
|
||||||
|
for _, val in ipairs(attrs) do
|
||||||
|
if val ~= spoof_nil then
|
||||||
|
nb_non_nil_values = nb_non_nil_values + 1
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
if nb_non_nil_values == 0 then
|
||||||
|
os.exit( RETURN_CODES.NOTHING_FOUND )
|
||||||
|
end
|
||||||
|
|
||||||
|
for _, val in ipairs(attrs) do
|
||||||
|
if val ~= spoof_nil then
|
||||||
|
print(val)
|
||||||
else
|
else
|
||||||
logger.print( HTML.tostring(el) )
|
print()
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
os.exit( RETURN_CODES.OK )
|
||||||
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue