2025-01-14 19:48:32 +01:00
|
|
|
#!/bin/env lua
|
2025-01-20 22:52:20 +01:00
|
|
|
--
|
|
|
|
-- Copyright (C) 2025 Guilian Celin--Davanture
|
|
|
|
--
|
|
|
|
-- This program is free software: you can redistribute it and/or
|
|
|
|
-- modify it under the terms of the GNU General Public License as
|
|
|
|
-- published by the Free Software Foundation, version 3.
|
|
|
|
--
|
|
|
|
-- This program is distributed in the hope that it will be useful,
|
|
|
|
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
-- See the GNU General Public License for more details.
|
|
|
|
--
|
|
|
|
-- You should have received a copy of the GNU General Public License
|
|
|
|
-- along with this program.
|
|
|
|
-- If not, see https://www.gnu.org/licenses/.
|
|
|
|
--
|
|
|
|
|
|
|
|
|
2025-01-14 19:48:32 +01:00
|
|
|
|
2025-01-20 21:16:24 +01:00
|
|
|
local HTML = require("html")
|
|
|
|
local CSS = require("css")
|
2025-01-14 19:48:32 +01:00
|
|
|
|
2025-01-20 21:16:24 +01:00
|
|
|
local logger = require("logging")
|
2025-01-14 19:48:32 +01:00
|
|
|
|
|
|
|
|
2025-02-05 13:50:15 +01:00
|
|
|
--- Strip leading and trailing whitespace from a string.
-- @tparam string str text to trim
-- @treturn string `str` without its surrounding whitespace
local function trim(str)
    -- The lazy capture keeps everything between the outer whitespace runs.
    local inner = str:match("^%s*(.-)%s*$")
    return inner
end
|
2025-01-19 14:18:43 +01:00
|
|
|
|
|
|
|
|
|
|
|
--- Tell whether a file can be opened for reading.
-- @tparam string name path to probe
-- @treturn boolean true when `io.open(name, "r")` succeeds, false otherwise
local function file_exists(name)
    local handle = io.open(name, "r")
    if handle == nil then
        return false
    end

    -- The handle was only needed as a probe; release it right away.
    io.close(handle)
    return true
end
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
--- Print the command-line help text through the logger.
local function print_usage()
    local synopsis = {
        "Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>",
        " html_path_or_minus: Path to HTML file or '-' for stdin",
        " css_selector: CSS selector to search for",
    }

    local flag_help = {
        " Flags:",
        " -1, --first-only: return only the first match",
        " -e, --errors: print warnings",
        " -t, --text: Print only the innerText of the matched elements",
        " -m, --markdown: Print only the innerText of the matched elements, but in a markdown-like syntax",
        " -a, --select-attribute: Print the value of the attribute on matched elements. Supersedes -t.",
    }

    for _, line in ipairs(synopsis) do
        logger.print(line)
    end

    -- Separator between the synopsis and the flag list; called with no
    -- argument on purpose.
    logger.print()

    for _, line in ipairs(flag_help) do
        logger.print(line)
    end
end
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-01-20 22:18:26 +01:00
|
|
|
-- Process exit statuses used by this script.
local RETURN_CODES = {
    OK              = 0, -- matches were found and printed
    NOTHING_FOUND   = 1, -- no element (or no requested attribute) matched
    ARGUMENTS_ERROR = 2, -- the command line could not be understood
    FAILED_INPUT    = 3, -- the HTML input file is missing or cannot be opened
}
|
|
|
|
|
2025-01-19 14:18:43 +01:00
|
|
|
|
|
|
|
|
|
|
|
-- Each flag is a unique empty table, used purely as an identity key.
local FLAGS = {
    FIRST_ONLY       = {},
    DO_PRINT_ERRORS  = {},
    INNER_TEXT       = {},
    INNER_MARKDOWN   = {},
    SELECT_ATTRIBUTE = {},
}

-- Lookup tables from the spelling used on the command line ("first-only",
-- "1", ...) to the flag identity above.
local LONGHAND_FLAGS = {}
local SHORTHAND_FLAGS = {}
do
    -- { short letter, long name, flag }
    local spellings = {
        { "1", "first-only",       FLAGS.FIRST_ONLY },
        { "e", "errors",           FLAGS.DO_PRINT_ERRORS },
        { "t", "text",             FLAGS.INNER_TEXT },
        { "m", "markdown",         FLAGS.INNER_MARKDOWN },
        { "a", "select-attribute", FLAGS.SELECT_ATTRIBUTE },
    }

    for _, entry in ipairs(spellings) do
        SHORTHAND_FLAGS[entry[1]] = entry[3]
        LONGHAND_FLAGS[entry[2]] = entry[3]
    end
end

-- Flags that consume the following command-line argument as their value.
local FLAG_NEEDS_VALUE = {}
FLAG_NEEDS_VALUE[FLAGS.SELECT_ATTRIBUTE] = true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-- Parsed command line: `flags` maps a FLAGS identity to `true` (boolean
-- flags) or to the string value that followed it (value-taking flags);
-- `positionals` holds the remaining arguments in order.
local flags = {}
local positionals = {}

-- Report a command-line error and terminate with ARGUMENTS_ERROR.
-- The usage text is printed only when `show_usage` is true.
local function fail_arguments(message, show_usage)
    logger.printerr(message)
    if show_usage then
        print_usage()
    end
    os.exit(RETURN_CODES.ARGUMENTS_ERROR)
end

-- Record a boolean flag, emitting `warning` if it was already set.
local function mark_boolean_flag(flag, warning)
    if flags[flag] then
        logger.printerr(warning)
    end
    flags[flag] = true
end

-- Return the argument following `position`, or abort with `missing_message`
-- when `position` is the last argument.
local function take_flag_value(position, missing_message)
    if position == #arg then
        fail_arguments(missing_message, false)
    end
    return arg[position + 1]
end

if #arg < 2 then
    fail_arguments("Error: Not enough arguments", true)
end

local cursor = 1
while cursor <= #arg do
    local argument = arg[cursor]
    -- Number of command-line entries consumed by this argument; becomes 2
    -- when a flag swallows the next entry as its value.
    local step = 1

    if argument:match("^%-%w+$") then
        -- Shorthand flags: a single letter (-a) or a group (-1et).
        local flag_str = argument:sub(2)

        if #flag_str > 1 then
            for letter in flag_str:gmatch("(%w)") do
                local flag = SHORTHAND_FLAGS[letter]

                if not flag then
                    fail_arguments("Unknown flag in group: -"..letter, true)
                end

                if FLAG_NEEDS_VALUE[flag] then
                    fail_arguments("Cannot use value-taking flags in groups: -"..letter, false)
                end

                mark_boolean_flag(flag, "Warning: passed -"..letter.." flag already!")
            end
        else
            local letter = flag_str
            local flag = SHORTHAND_FLAGS[letter]

            if not flag then
                fail_arguments("Unknown flag: -"..letter, true)
            end

            if FLAG_NEEDS_VALUE[flag] then
                flags[flag] = take_flag_value(cursor, "Flag -"..letter.." requires a value")
                step = 2
            else
                mark_boolean_flag(flag, "Warning: passed -"..letter.." flag already!")
            end
        end

    elseif argument:match("^%-%-") then
        -- Long flags (--flag).
        local flagname = argument:sub(3)
        local flag = LONGHAND_FLAGS[flagname]

        if not flag then
            fail_arguments("Unknown flag: --"..flagname, true)
        end

        if FLAG_NEEDS_VALUE[flag] then
            flags[flag] = take_flag_value(cursor, "Flag --"..flagname.." requires a value")
            step = 2
        else
            mark_boolean_flag(flag, "Warning: passed --"..flagname.." flag already!")
        end

    else
        -- Anything else (including a lone "-") is a positional argument.
        positionals[#positionals + 1] = argument
    end

    cursor = cursor + step
end
|
|
|
|
|
|
|
|
|
2025-01-25 17:31:09 +01:00
|
|
|
|
2025-01-20 17:17:25 +01:00
|
|
|
-- Honour -e/--errors as soon as the flags are known.
-- NOTE(review): the exact effect of enable_printing_errors lives in the
-- logging module and is not visible here.
if flags[ FLAGS.DO_PRINT_ERRORS ] then
    logger.enable_printing_errors()
end

-- Exactly two positionals are required: the HTML source and the selector.
-- Counting raw arguments is not enough, because flags (and the value taken
-- by -a) also count towards #arg; e.g. `-t -e page.html` has three raw
-- arguments but no selector.
if #positionals < 2 then
    logger.printerr("Error: Not enough arguments")
    print_usage()
    os.exit( RETURN_CODES.ARGUMENTS_ERROR )
end

if #positionals > 2 then
    logger.printerr("Error: too many arguments !")
    print_usage()
    os.exit( RETURN_CODES.ARGUMENTS_ERROR )
end
|
2025-01-18 18:54:10 +01:00
|
|
|
|
2025-01-19 14:18:43 +01:00
|
|
|
-- Load the HTML text either from the file named by the first positional
-- argument or, when that argument is "-", from standard input.
local html_file = positionals[1]
local html = nil

if html_file ~= "-" then
    if not( file_exists( html_file )) then
        logger.printerr("File doesn't exist: " .. html_file)
        os.exit( RETURN_CODES.FAILED_INPUT )
    end

    local handle = io.open( html_file, "r" )
    if not handle then
        logger.printerr("Failed to open file " .. html_file)
        os.exit( RETURN_CODES.FAILED_INPUT )
    end

    html = handle:read("a")
    -- Release the descriptor as soon as the content is in memory.
    handle:close()

    -- read("a") yields nil only on a read error (for instance when the path
    -- is a directory); report it instead of handing nil to the parser.
    if html == nil then
        logger.printerr("Failed to read file " .. html_file)
        os.exit( RETURN_CODES.FAILED_INPUT )
    end
else
    html = io.read("a")

    if html == nil then
        logger.printerr("Failed to read from stdin")
        os.exit( RETURN_CODES.FAILED_INPUT )
    end
end
|
|
|
|
|
|
|
|
-- Parse both inputs: the HTML text and the selector given as the second
-- positional argument.
local document = HTML.parse( html )
local css_selector = CSS.parse( positionals[2] )

-- `current_selector` walks the parsed selector chain; `elements` is the set
-- of nodes matched so far.
local current_selector = css_selector
local elements = {}

-- Seed the set with every node that satisfies the first simple selector.
local function collect_if_matching( node )
    if node:check_simple_selector( current_selector.selector ) then
        elements[#elements + 1] = node
    end
end

document:foreach( collect_if_matching )
|
|
|
|
|
|
|
|
-- Walk the selector chain. At each step the current element set is replaced
-- by the nodes related to it through the combinator and matching the next
-- simple selector.
while current_selector.combinator ~= nil do
    local combinator = current_selector.combinator
    local next_selector = current_selector.next

    local new_elements = {}
    -- Nodes already collected in this step, keyed by identity. The same node
    -- can be reached from several elements of the current set (two siblings
    -- that both precede it for "~", or nested ancestors that both match the
    -- left-hand selector), and must be reported only once.
    local seen = {}

    local function add_match( node )
        if not seen[node] then
            seen[node] = true
            new_elements[#new_elements + 1] = node
        end
    end

    if combinator == CSS.COMBINATORS.DESCENDANT then
        for _, element in ipairs( elements ) do
            element:foreach(function( el )
                -- NOTE(review): foreach is assumed to walk the subtree of
                -- `element`. The identity guard keeps `element` itself out
                -- in case the walk also visits the node it is called on.
                if not rawequal( el, element )
                    and el:check_simple_selector( next_selector.selector ) then
                    add_match( el )
                end
            end)
        end

    elseif combinator == CSS.COMBINATORS.DIRECT_DESCENDANT then
        for _, element in ipairs( elements ) do
            for _, child in ipairs( element.children ) do
                if child:check_simple_selector( next_selector.selector ) then
                    add_match( child )
                end
            end
        end

    elseif combinator == CSS.COMBINATORS.NEXT_SIBLING then
        for _, element in ipairs( elements ) do
            -- Skip ":text" nodes: only the next non-text sibling is tested.
            local next_sibling = element:get_next_sibling()
            while next_sibling and next_sibling.tag_name == ":text" do
                next_sibling = next_sibling:get_next_sibling()
            end

            if next_sibling and next_sibling:check_simple_selector( next_selector.selector ) then
                add_match( next_sibling )
            end
        end

    elseif combinator == CSS.COMBINATORS.SUBSEQUENT_SIBLING then
        for _, element in ipairs( elements ) do
            local sibling = element:get_next_sibling()
            while sibling ~= nil do
                if sibling:check_simple_selector( next_selector.selector ) then
                    add_match( sibling )
                end

                sibling = sibling:get_next_sibling()
            end
        end
    end

    -- An unrecognised combinator leaves new_elements empty, as before.
    elements = new_elements
    current_selector = next_selector
end
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-01-20 22:18:26 +01:00
|
|
|
-- Nothing matched: signal it through the exit status and print nothing.
local match_count = #elements
if match_count < 1 then
    os.exit( RETURN_CODES.NOTHING_FOUND )
end

-- With -1/--first-only only the first match is shown, otherwise all of them.
local MAX_NUMBER_OF_ELEMENTS_TO_SHOW = flags[FLAGS.FIRST_ONLY] and 1 or match_count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-- Attribute mode (-a/--select-attribute): print the requested attribute of
-- each shown element, one per line, then exit. Elements lacking the
-- attribute produce an empty line; if none of them has it, nothing is
-- printed and the script exits with NOTHING_FOUND.
local attr = flags[FLAGS.SELECT_ATTRIBUTE]
if attr then
    -- Sparse by design: values[index] stays nil when element `index` has no
    -- such attribute.
    local values = {}
    local found = 0

    for index = 1, MAX_NUMBER_OF_ELEMENTS_TO_SHOW do
        local value = elements[index].attributes[attr]
        if value then
            values[index] = value
            found = found + 1
        end
    end

    if found == 0 then
        os.exit( RETURN_CODES.NOTHING_FOUND )
    end

    for index = 1, MAX_NUMBER_OF_ELEMENTS_TO_SHOW do
        local value = values[index]
        if value then
            print(value)
        else
            print()
        end
    end

    os.exit( RETURN_CODES.OK )
end
|
|
|
|
|
2025-02-05 13:50:15 +01:00
|
|
|
|
|
|
|
|
|
|
|
-- Default output mode: pick how a matched element is rendered once, then
-- apply it to every element to show. Markdown takes precedence over plain
-- text; with neither flag the element is serialised back to HTML.
local emit
if flags[FLAGS.INNER_MARKDOWN] then
    emit = function( el )
        logger.print( trim(el:inner_markdown()) )
    end
elseif flags[FLAGS.INNER_TEXT] then
    emit = function( el )
        logger.print( el:inner_text() )
    end
else
    emit = function( el )
        logger.print( HTML.tostring(el) )
    end
end

for index = 1, MAX_NUMBER_OF_ELEMENTS_TO_SHOW do
    emit( elements[index] )
end

os.exit( RETURN_CODES.OK )
|