Compare commits
8 Commits
763484013c
...
b94a21cb2e
Author | SHA1 | Date |
---|---|---|
|
b94a21cb2e | |
|
1589f2f395 | |
|
19a8896978 | |
|
9fd8b3e15d | |
|
94f7694c1e | |
|
79d6a8f77d | |
|
c923159d7a | |
|
bd2b04216c |
|
@ -2,3 +2,5 @@
|
|||
*.luastatic.c
|
||||
# Compiled executable from main.lua
|
||||
main
|
||||
# Same but with correct name
|
||||
htmlq
|
||||
|
|
41
README.md
41
README.md
|
@ -41,15 +41,22 @@ Supported combinators are all the "basic" ones:
|
|||
|
||||
## Usage
|
||||
|
||||
```
|
||||
Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>
|
||||
html_path_or_minus: Path to HTML file or '-' for stdin
|
||||
css_selector: CSS selector to search for
|
||||
Once compiled, you can run Htmlq using the following command:
|
||||
|
||||
Flags:
|
||||
-f, --first-only: return only the first match
|
||||
-q, --quiet: Don't print warnings
|
||||
```
|
||||
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
|
||||
```
|
||||
|
||||
Where:
|
||||
|
||||
* `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
|
||||
* `<css_selector>` is the CSS selector you want to use to query the HTML.
|
||||
|
||||
### Flags
|
||||
|
||||
* `-1`, `--first-only`: Return only the first match
|
||||
* `-q`, `--quiet`: Don't print warnings
|
||||
* `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText) of the matched elements
|
||||
|
||||
## Motivation
|
||||
|
||||
|
@ -78,25 +85,7 @@ luarocks install luastatic
|
|||
Once `luastatic` is installed, you can compile Htmlq by running the following command in your terminal, from the project's root directory:
|
||||
|
||||
```
|
||||
luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so
|
||||
luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so -o htmlq
|
||||
```
|
||||
|
||||
Note that all `.lua` files from the project need to be specified, with `main.lua` as the first one. Also, the path to `liblua` may vary according to your system. The example provided is for an installation on EndeavourOS.
|
||||
|
||||
## Running
|
||||
|
||||
Once compiled, you can run Htmlq using the following command:
|
||||
|
||||
```
|
||||
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
|
||||
```
|
||||
|
||||
Where:
|
||||
|
||||
* `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
|
||||
* `<css_selector>` is the CSS selector you want to use to query the HTML.
|
||||
|
||||
### Flags
|
||||
|
||||
* `-f`, `--first-only`: Return only the first match
|
||||
* `-q`, `--quiet`: Don't print warnings
|
||||
|
|
34
css.lua
34
css.lua
|
@ -66,7 +66,8 @@ local function parse_compound_selector( tokeniser )
|
|||
tag_name = nil,
|
||||
id = nil,
|
||||
class = {},
|
||||
attributes = {},
|
||||
attributes_values = {},
|
||||
attributes_present = {},
|
||||
}
|
||||
|
||||
--local selectors = {}
|
||||
|
@ -106,6 +107,37 @@ local function parse_compound_selector( tokeniser )
|
|||
end
|
||||
--table.insert(selectors, {type = "id", value = name})
|
||||
selector.id = name
|
||||
elseif char == "[" then
|
||||
tokeniser.next() -- consume leading [
|
||||
|
||||
local name = tokeniser.read_identifier()
|
||||
|
||||
if tokeniser.peek() == "=" then
|
||||
tokeniser.next()
|
||||
|
||||
if tokeniser.peek() ~= "\"" then
|
||||
error("Expected opening quote \" at pos " .. tokeniser.pos() )
|
||||
end
|
||||
tokeniser.next() -- consume leading "
|
||||
|
||||
local value = ""
|
||||
while tokeniser.peek() ~= "\"" do
|
||||
value = value .. tokeniser.peek()
|
||||
tokeniser.next()
|
||||
end
|
||||
|
||||
tokeniser.next() -- consume trailing "
|
||||
|
||||
selector.attributes_values[name] = value
|
||||
else
|
||||
table.insert( selector.attributes_present, name )
|
||||
end
|
||||
|
||||
if tokeniser.peek() ~= "]" then
|
||||
error("Expected closing bracket (']') at " .. tokeniser.pos())
|
||||
end
|
||||
|
||||
tokeniser.next() -- consume trailing ]
|
||||
else
|
||||
break
|
||||
end
|
||||
|
|
91
html.lua
91
html.lua
|
@ -39,6 +39,62 @@ local VOID_TAGS = {
|
|||
wbr = true,
|
||||
}
|
||||
|
||||
local INLINE_TAGS = {
|
||||
-- Text formatting
|
||||
a = true,
|
||||
abbr = true,
|
||||
b = true,
|
||||
bdi = true,
|
||||
bdo = true,
|
||||
cite = true,
|
||||
code = true,
|
||||
data = true,
|
||||
dfn = true,
|
||||
em = true,
|
||||
i = true,
|
||||
kbd = true,
|
||||
mark = true,
|
||||
q = true,
|
||||
ruby = true,
|
||||
s = true,
|
||||
samp = true,
|
||||
small = true,
|
||||
span = true,
|
||||
strong = true,
|
||||
sub = true,
|
||||
sup = true,
|
||||
time = true,
|
||||
u = true,
|
||||
var = true,
|
||||
|
||||
-- Interactive elements
|
||||
button = true,
|
||||
label = true,
|
||||
select = true,
|
||||
textarea = true,
|
||||
|
||||
-- Media/content
|
||||
img = true,
|
||||
picture = true,
|
||||
map = true,
|
||||
object = true,
|
||||
|
||||
-- Line break
|
||||
br = true,
|
||||
wbr = true,
|
||||
|
||||
-- Forms
|
||||
input = true,
|
||||
output = true,
|
||||
progress = true,
|
||||
meter = true,
|
||||
|
||||
-- Scripting
|
||||
script = true,
|
||||
noscript = true,
|
||||
template = true,
|
||||
}
|
||||
|
||||
|
||||
function M.make_dom_element( tag_name, parent_elem )
|
||||
local o = {
|
||||
|
@ -86,6 +142,23 @@ function M.make_dom_element( tag_name, parent_elem )
|
|||
for _, child in ipairs(self.children or {}) do
|
||||
child:foreach( fn )
|
||||
end
|
||||
end,
|
||||
|
||||
-- Returns the text of this node and of all its descendants as one string.
--
-- A text node (tag_name == ":text") returns its own `content` unchanged.
-- For any other node, the text of each child is appended in order, and a
-- newline is added after every child whose tag name is not in INLINE_TAGS.
-- A node that has no `children` table yields the empty string.
inner_text = function(self)
  if self.tag_name == ":text" then
    return self.content
  end

  -- Gather the pieces and join them once, rather than re-allocating a
  -- growing string for every child.
  local pieces = {}

  -- `children` may be missing; treat that as "no children" (same guard as
  -- the child-traversal loop defined just above in this table).
  for _, child in ipairs(self.children or {}) do
    pieces[#pieces + 1] = child:inner_text()

    if not INLINE_TAGS[child.tag_name] then
      pieces[#pieces + 1] = "\n"
    end
  end

  return table.concat(pieces)
end
|
||||
}
|
||||
|
||||
|
@ -349,8 +422,22 @@ function M.check_simple_selector(element, selector)
|
|||
end
|
||||
end
|
||||
|
||||
return true
|
||||
for attr_name, attr_value in pairs(selector.attributes_values) do
|
||||
local elem_attr_value = element.attributes[attr_name]
|
||||
if elem_attr_value ~= attr_value then
|
||||
return false
|
||||
end
|
||||
end
|
||||
|
||||
-- Check attribute presence selectors
|
||||
for _, attr_name in ipairs(selector.attributes_present) do
|
||||
if not element.attributes[attr_name] then
|
||||
return false
|
||||
end
|
||||
end
|
||||
|
||||
return true
|
||||
end
|
||||
|
||||
function M.query_simple_selector(document, selector)
|
||||
local matches = {}
|
||||
|
@ -524,7 +611,7 @@ function M.clean_text_nodes(node)
|
|||
return
|
||||
end
|
||||
|
||||
node.content = trim( node.content:gsub("%s+", " ") )
|
||||
node.content = node.content:gsub("%s+", " ")
|
||||
end
|
||||
|
||||
|
||||
|
|
28
main.lua
28
main.lua
|
@ -21,8 +21,9 @@ local function print_usage()
|
|||
logger.print(" css_selector: CSS selector to search for")
|
||||
logger.print()
|
||||
logger.print(" Flags:")
|
||||
logger.print(" -f, --first-only: return only the first match")
|
||||
logger.print(" -1, --first-only: return only the first match")
|
||||
logger.print(" -q, --quiet: Don't print warnings")
|
||||
logger.print(" -t, --text: Print only the innerText of the matched elements")
|
||||
os.exit(1)
|
||||
end
|
||||
|
||||
|
@ -32,17 +33,20 @@ end
|
|||
|
||||
local FLAGS = {
|
||||
FIRST_ONLY = {},
|
||||
NO_PRINT_ERRORS = {},
|
||||
DO_PRINT_ERRORS = {},
|
||||
INNER_TEXT = {},
|
||||
}
|
||||
|
||||
local LONGHAND_FLAGS = {
|
||||
["first-only"] = FLAGS.FIRST_ONLY,
|
||||
["quiet"] = FLAGS.NO_PRINT_ERRORS
|
||||
["errors"] = FLAGS.DO_PRINT_ERRORS,
|
||||
["text"] = FLAGS.INNER_TEXT,
|
||||
}
|
||||
|
||||
local SHORTHAND_FLAGS = {
|
||||
["f"] = FLAGS.FIRST_ONLY,
|
||||
["q"] = FLAGS.NO_PRINT_ERRORS,
|
||||
["1"] = FLAGS.FIRST_ONLY,
|
||||
["e"] = FLAGS.DO_PRINT_ERRORS,
|
||||
["t"] = FLAGS.INNER_TEXT,
|
||||
}
|
||||
|
||||
|
||||
|
@ -94,7 +98,7 @@ for _, argument in ipairs(arg) do
|
|||
end
|
||||
|
||||
|
||||
if not flags[ FLAGS.NO_PRINT_ERRORS ] then
|
||||
if flags[ FLAGS.DO_PRINT_ERRORS ] then
|
||||
logger.enable_printing_errors()
|
||||
end
|
||||
|
||||
|
@ -122,7 +126,7 @@ if html_file ~= "-" then
|
|||
|
||||
html = handle:read("a")
|
||||
else
|
||||
html = io.read()
|
||||
html = io.read("a")
|
||||
end
|
||||
|
||||
local document = HTML.parse( html )
|
||||
|
@ -210,6 +214,12 @@ end
|
|||
|
||||
if flags[FLAGS.FIRST_ONLY] then
|
||||
if #elements > 0 then
|
||||
|
||||
if flags[FLAGS.INNER_TEXT] then
|
||||
logger.print( elements[1]:inner_text() )
|
||||
return 0
|
||||
end
|
||||
|
||||
logger.print( HTML.tostring( elements[1] ) )
|
||||
end
|
||||
|
||||
|
@ -217,5 +227,9 @@ if flags[FLAGS.FIRST_ONLY] then
|
|||
end
|
||||
|
||||
for _, el in ipairs(elements) do
|
||||
if flags[FLAGS.INNER_TEXT] then
|
||||
logger.print( el:inner_text() )
|
||||
else
|
||||
logger.print( HTML.tostring(el) )
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue