Compare commits

...

8 Commits

Author SHA1 Message Date
Guilian b94a21cb2e
chore: --text usage 2025-01-20 17:20:28 +01:00
Guilian 1589f2f395
chore: indent 2025-01-20 17:18:02 +01:00
Guilian 19a8896978
feat: invert error printing flag
Just way too much output on standard webpages
2025-01-20 17:17:25 +01:00
Guilian 9fd8b3e15d
chore: compile to `htmlq` 2025-01-20 17:16:56 +01:00
Guilian 94f7694c1e
fix: read stdin properly 2025-01-20 17:05:44 +01:00
Guilian 79d6a8f77d
feat: attribute selection 2025-01-20 17:05:29 +01:00
Guilian c923159d7a
feat: get element inner text (--text option) 2025-01-20 17:05:04 +01:00
Guilian bd2b04216c
refactor: change -f to -1 to get one result 2025-01-20 17:03:32 +01:00
5 changed files with 197 additions and 73 deletions

2
.gitignore vendored
View File

@ -2,3 +2,5 @@
*.luastatic.c *.luastatic.c
# Compiled executable from main.lua # Compiled executable from main.lua
main main
# Same but with correct name
htmlq

View File

@ -41,15 +41,22 @@ Supported combinators are all the "basic" ones:
## Usage ## Usage
``` Once compiled, you can run Htmlq using the following command:
Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>
html_path_or_minus: Path to HTML file or '-' for stdin
css_selector: CSS selector to search for
Flags:
-f, --first-only: return only the first match
-q, --quiet: Don't print warnings
``` ```
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
```
Where:
* `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
* `<css_selector>` is the CSS selector you want to use to query the HTML.
### Flags
* `-1`, `--first-only`: Return only the first match
* `-e`, `--errors`: Print warnings and errors (they are suppressed by default)
* `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText) of the matched elements
## Motivation ## Motivation
@ -78,25 +85,7 @@ luarocks install luastatic
Once `luastatic` is installed, you can compile Htmlq by running the following command in your terminal, from the project's root directory: Once `luastatic` is installed, you can compile Htmlq by running the following command in your terminal, from the project's root directory:
``` ```
luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so -o htmlq
``` ```
Note that all `.lua` files from the project need to be specified, with `main.lua` as the first one. Also, the path to `liblua` may vary according to your system. The example provided is for an installation on EndeavourOS. Note that all `.lua` files from the project need to be specified, with `main.lua` as the first one. Also, the path to `liblua` may vary according to your system. The example provided is for an installation on EndeavourOS.
## Running
Once compiled, you can run Htmlq using the following command:
```
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
```
Where:
* `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
* `<css_selector>` is the CSS selector you want to use to query the HTML.
### Flags
* `-f`, `--first-only`: Return only the first match
* `-q`, `--quiet`: Don't print warnings

34
css.lua
View File

@ -66,7 +66,8 @@ local function parse_compound_selector( tokeniser )
tag_name = nil, tag_name = nil,
id = nil, id = nil,
class = {}, class = {},
attributes = {}, attributes_values = {},
attributes_present = {},
} }
--local selectors = {} --local selectors = {}
@ -106,6 +107,37 @@ local function parse_compound_selector( tokeniser )
end end
--table.insert(selectors, {type = "id", value = name}) --table.insert(selectors, {type = "id", value = name})
selector.id = name selector.id = name
elseif char == "[" then
tokeniser.next() -- consume leading [
local name = tokeniser.read_identifier()
if tokeniser.peek() == "=" then
tokeniser.next()
if tokeniser.peek() ~= "\"" then
error("Expected opening quote \" at pos " .. tokeniser.pos() )
end
tokeniser.next() -- consume leading "
local value = ""
while tokeniser.peek() ~= "\"" do
value = value .. tokeniser.peek()
tokeniser.next()
end
tokeniser.next() -- consume trailing "
selector.attributes_values[name] = value
else
table.insert( selector.attributes_present, name )
end
if tokeniser.peek() ~= "]" then
error("Expected closing bracket (']') at " .. tokeniser.pos())
end
tokeniser.next() -- consume trailing ]
else else
break break
end end

163
html.lua
View File

@ -5,11 +5,11 @@ local function trim(str)
end end
local function shallow_copy(t) local function shallow_copy(t)
local t2 = {} local t2 = {}
for k,v in pairs(t) do for k,v in pairs(t) do
t2[k] = v t2[k] = v
end end
return t2 return t2
end end
@ -39,6 +39,62 @@ local VOID_TAGS = {
wbr = true, wbr = true,
} }
-- Set of tag names treated as inline when building an element's inner text.
-- inner_text() concatenates the text of each child and appends a "\n" after
-- the child only when the child's tag_name is NOT a key of this table, so
-- listing a tag here keeps its text on the same line as what follows it.
-- NOTE(review): ":text" is not listed, so text-node children also get a
-- trailing newline in inner_text() — confirm that this is intended.
local INLINE_TAGS = {
-- Text formatting
a = true,
abbr = true,
b = true,
bdi = true,
bdo = true,
cite = true,
code = true,
data = true,
dfn = true,
em = true,
i = true,
kbd = true,
mark = true,
q = true,
ruby = true,
s = true,
samp = true,
small = true,
span = true,
strong = true,
sub = true,
sup = true,
time = true,
u = true,
var = true,
-- Interactive elements
button = true,
label = true,
select = true,
textarea = true,
-- Media/content
img = true,
picture = true,
map = true,
object = true,
-- Line break
br = true,
wbr = true,
-- Forms
input = true,
output = true,
progress = true,
meter = true,
-- Scripting
script = true,
noscript = true,
template = true,
}
function M.make_dom_element( tag_name, parent_elem ) function M.make_dom_element( tag_name, parent_elem )
local o = { local o = {
@ -86,6 +142,23 @@ function M.make_dom_element( tag_name, parent_elem )
for _, child in ipairs(self.children or {}) do for _, child in ipairs(self.children or {}) do
child:foreach( fn ) child:foreach( fn )
end end
end,
inner_text = function(self)
if self.tag_name == ":text" then
return self.content
end
local text = ""
for _, child in ipairs(self.children) do
text = text .. child:inner_text()
if not INLINE_TAGS[child.tag_name] then
text = text .. "\n"
end
end
return text
end end
} }
@ -313,45 +386,59 @@ end
function M.check_simple_selector(element, selector) function M.check_simple_selector(element, selector)
-- Skip text nodes -- Skip text nodes
if element.tag_name == ":text" then if element.tag_name == ":text" then
return false
end
-- Check tag name if specified
if selector.tag_name and element.tag_name ~= selector.tag_name then
return false
end
-- Check ID if specified
if selector.id and element.attributes.id ~= selector.id then
return false
end
-- Check classes if specified
if selector.class and #selector.class > 0 then
local element_classes = element.attributes.class
if not element_classes then
return false return false
end end
-- Check tag name if specified for _, class in ipairs(selector.class) do
if selector.tag_name and element.tag_name ~= selector.tag_name then local found = false
return false for _, elem_class in ipairs(element_classes) do
end if elem_class == class then
found = true
-- Check ID if specified break
if selector.id and element.attributes.id ~= selector.id then end
return false end
end if not found then
-- Check classes if specified
if selector.class and #selector.class > 0 then
local element_classes = element.attributes.class
if not element_classes then
return false return false
end end
for _, class in ipairs(selector.class) do
local found = false
for _, elem_class in ipairs(element_classes) do
if elem_class == class then
found = true
break
end
end
if not found then
return false
end
end
end end
return true
end end
for attr_name, attr_value in pairs(selector.attributes_values) do
local elem_attr_value = element.attributes[attr_name]
if elem_attr_value ~= attr_value then
return false
end
end
-- Check attribute presence selectors
for _, attr_name in ipairs(selector.attributes_present) do
if not element.attributes[attr_name] then
return false
end
end
return true
end
function M.query_simple_selector(document, selector) function M.query_simple_selector(document, selector)
local matches = {} local matches = {}
@ -524,7 +611,7 @@ function M.clean_text_nodes(node)
return return
end end
node.content = trim( node.content:gsub("%s+", " ") ) node.content = node.content:gsub("%s+", " ")
end end
@ -559,7 +646,7 @@ function M._tostring(node, indent, include_internal_pseudoelements)
local node_name = "" local node_name = ""
if not is_pseudo_element or include_internal_pseudoelements then if not is_pseudo_element or include_internal_pseudoelements then
-- Print the current node's tag name -- Print the current node's tag name
node_name = node_name .. "\n" .. indent_str .. "<" .. (node.tag_name or ":root") node_name = node_name .. "\n" .. indent_str .. "<" .. (node.tag_name or ":root")
end end

View File

@ -21,8 +21,9 @@ local function print_usage()
logger.print(" css_selector: CSS selector to search for") logger.print(" css_selector: CSS selector to search for")
logger.print() logger.print()
logger.print(" Flags:") logger.print(" Flags:")
logger.print(" -f, --first-only: return only the first match") logger.print(" -1, --first-only: return only the first match")
logger.print(" -q, --quiet: Don't print warnings") logger.print(" -q, --quiet: Don't print warnings")
logger.print(" -t, --text: Print only the innerText of the matched elements")
os.exit(1) os.exit(1)
end end
@ -32,17 +33,20 @@ end
local FLAGS = { local FLAGS = {
FIRST_ONLY = {}, FIRST_ONLY = {},
NO_PRINT_ERRORS = {}, DO_PRINT_ERRORS = {},
INNER_TEXT = {},
} }
local LONGHAND_FLAGS = { local LONGHAND_FLAGS = {
["first-only"] = FLAGS.FIRST_ONLY, ["first-only"] = FLAGS.FIRST_ONLY,
["quiet"] = FLAGS.NO_PRINT_ERRORS ["errors"] = FLAGS.DO_PRINT_ERRORS,
["text"] = FLAGS.INNER_TEXT,
} }
local SHORTHAND_FLAGS = { local SHORTHAND_FLAGS = {
["f"] = FLAGS.FIRST_ONLY, ["1"] = FLAGS.FIRST_ONLY,
["q"] = FLAGS.NO_PRINT_ERRORS, ["e"] = FLAGS.DO_PRINT_ERRORS,
["t"] = FLAGS.INNER_TEXT,
} }
@ -94,7 +98,7 @@ for _, argument in ipairs(arg) do
end end
if not flags[ FLAGS.NO_PRINT_ERRORS ] then if flags[ FLAGS.DO_PRINT_ERRORS ] then
logger.enable_printing_errors() logger.enable_printing_errors()
end end
@ -122,7 +126,7 @@ if html_file ~= "-" then
html = handle:read("a") html = handle:read("a")
else else
html = io.read() html = io.read("a")
end end
local document = HTML.parse( html ) local document = HTML.parse( html )
@ -210,6 +214,12 @@ end
if flags[FLAGS.FIRST_ONLY] then if flags[FLAGS.FIRST_ONLY] then
if #elements > 0 then if #elements > 0 then
if flags[FLAGS.INNER_TEXT] then
logger.print( elements[1]:inner_text() )
return 0
end
logger.print( HTML.tostring( elements[1] ) ) logger.print( HTML.tostring( elements[1] ) )
end end
@ -217,5 +227,9 @@ if flags[FLAGS.FIRST_ONLY] then
end end
for _, el in ipairs(elements) do for _, el in ipairs(elements) do
logger.print( HTML.tostring(el) ) if flags[FLAGS.INNER_TEXT] then
logger.print( el:inner_text() )
else
logger.print( HTML.tostring(el) )
end
end end