Compare commits
7 Commits
e9c2553f88
...
abacff6ec9
| Author | SHA1 | Date |
|---|---|---|
|
|
abacff6ec9 | |
|
|
fc3155e6d7 | |
|
|
fd06af5983 | |
|
|
da51b3697e | |
|
|
f5a4a81147 | |
|
|
a8a295aaf1 | |
|
|
feb98ab5ab |
|
|
@ -0,0 +1,4 @@
|
|||
# Luastatic-generated file
|
||||
*.luastatic.c
|
||||
# Compiled executable from main.lua
|
||||
main
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
|
||||
# Htmlq
|
||||
|
||||
## Overview
|
||||
|
||||
> In short: [jq](https://jqlang.github.io/jq/), but for HTML
|
||||
|
||||
This project is a from-scratch implementation of an HTML and CSS parser, written entirely in Lua. No external dependencies. It's designed to take HTML and CSS as input and provide a way to query the Document Object Model (DOM) using CSS selectors.
|
||||
|
||||
## Features
|
||||
|
||||
There's really only one feature: it takes in HTML and a CSS selector, and returns whatever is matched by that selector in the DOM.
|
||||
|
||||
Supported simple selectors:
|
||||
* **tag name** - `h1`
|
||||
* **class** - `.class`
|
||||
* **id** - `#id`
|
||||
|
||||
Any _compound_ selector is supported as well (e.g. `p.text-center.bold` matches every `p` that has both the `text-center` and `bold` classes).
|
||||
|
||||
|
||||
Supported combinators are all the "basic" ones:
|
||||
* ` ` - the [descendant combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#descendant_combinator)
|
||||
* `>` - the [child combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#child_combinator)
|
||||
* `+` - the [next sibling combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#next-sibling_combinator)
|
||||
* `~` - the [subsequent sibling combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#subsequent-sibling_combinator)
|
||||
|
||||
|
||||
### Limitations
|
||||
|
||||
* The [column](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors/Selectors_and_combinators#column_combinator) and [namespace](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors/Selectors_and_combinators#namespace_separator) combinators are **not** supported
|
||||
* **Here be dragons**: This tool was written by someone who is not especially good at writing parsers; it may break or behave unexpectedly. Don't hesitate to report issues!
|
||||
* This tool was not designed with speed in mind; it seems _fast enough_ for common CLI usage.
|
||||
|
||||
### TODO
|
||||
|
||||
- [ ] Universal selector (`*` to match any element)
|
||||
- [ ] Attribute selectors (`[attr="value"]`)
|
||||
- [ ] A way to "group" selectors, e.g. `aside {p, footer}` to select all `p`s and `footer`s in `aside`s ?
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>
|
||||
html_path_or_minus: Path to HTML file or '-' for stdin
|
||||
css_selector: CSS selector to search for
|
||||
|
||||
Flags:
|
||||
-f, --first-only: return only the first match
|
||||
-q, --quiet: Don't print warnings
|
||||
```
|
||||
|
||||
## Motivation
|
||||
|
||||
I built this for a specific need of mine: I wanted to systematically extract the HTML starting at an element with a certain id, up to its closing tag. While I could probably have hacked something together for this one-time use case, in typical programmer spirit, I decided to create a tool.
|
||||
|
||||
This is my first parser, and it was very fun!
|
||||
Writing a parser seems to be a kind of "rite of passage" for programmers, and now I did it too.
|
||||
|
||||
Obviously, this could have been solved with `jsdom` and like 10 lines of JS.
|
||||
|
||||
Plus, it's kinda neat to have a lightweight, dependency-free way to mess with web stuff in Lua.
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
Htmlq is written in Lua and requires no external dependencies. To use it, you will need to have Lua installed on your system. You can check if Lua is installed by running `lua -v` in your terminal. If Lua is not installed, you can install it from your distribution's package manager or from the official Lua website.
|
||||
|
||||
## Compiling
|
||||
|
||||
To compile Htmlq, you will need to use `luastatic`. You can install `luastatic` via `luarocks` by running the following command:
|
||||
|
||||
```
|
||||
luarocks install luastatic
|
||||
```
|
||||
|
||||
Once `luastatic` is installed, you can compile Htmlq by running the following command in your terminal, from the project's root directory:
|
||||
|
||||
```
|
||||
luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so
|
||||
```
|
||||
|
||||
Note that all `.lua` files from the project need to be specified, with `main.lua` as the first one. Also, the path to `liblua` may vary according to your system. The example provided is for an installation on EndeavourOS.
|
||||
|
||||
## Running
|
||||
|
||||
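Once compiled, you can run Htmlq using the following command (`luastatic` names the executable `main`, after `main.lua`; rename it to `htmlq` first, or run `./main` instead):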
|
||||
|
||||
```
|
||||
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
|
||||
```
|
||||
|
||||
Where:
|
||||
|
||||
* `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
|
||||
* `<css_selector>` is the CSS selector you want to use to query the HTML.
|
||||
|
||||
### Flags
|
||||
|
||||
* `-f`, `--first-only`: Return only the first match
|
||||
* `-q`, `--quiet`: Don't print warnings
|
||||
23
html.lua
23
html.lua
|
|
@ -1,3 +1,4 @@
|
|||
local logger = require(".logging")
|
||||
|
||||
local function trim(str)
|
||||
return str:match("^%s*(.-)%s*$")
|
||||
|
|
@ -232,13 +233,15 @@ function M.tokenise( content )
|
|||
|
||||
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
|
||||
if RAW_TEXT_TAGS[word] then
|
||||
print("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
-- made possible because of the whitespace removal at the start
|
||||
set_skipping_to("</" .. word)
|
||||
end
|
||||
end
|
||||
|
||||
if not word:match("^%s*$") then
|
||||
table.insert( TOKENS, {type="WORD", value=word})
|
||||
end
|
||||
else
|
||||
table.insert( TOKENS, {type="TEXT", value=text_memory} )
|
||||
end
|
||||
|
|
@ -261,7 +264,7 @@ function M.tokenise( content )
|
|||
|
||||
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
|
||||
if RAW_TEXT_TAGS[word] then
|
||||
print("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
|
||||
-- made possible because of the whitespace removal at the start
|
||||
set_skipping_to("</" .. word)
|
||||
text_memory = ""
|
||||
|
|
@ -271,8 +274,10 @@ function M.tokenise( content )
|
|||
end
|
||||
end
|
||||
|
||||
if not word:match("^%s*$") then
|
||||
table.insert( TOKENS, {type="WORD", value=word})
|
||||
text_memory = ""
|
||||
end
|
||||
|
||||
goto continue
|
||||
end
|
||||
|
|
@ -397,7 +402,7 @@ function M.parse_tokens_into_document( TOKENS )
|
|||
|
||||
if curr_elem.parent == nil then
|
||||
-- reached DOCUMENT root
|
||||
print("Warning: reached document root while trying to match for closing " .. token.value .. " token.")
|
||||
logger.printerr("Warning: reached document root while trying to match for closing " .. token.value .. " token.")
|
||||
current_doc_element = DOCUMENT
|
||||
else
|
||||
current_doc_element = curr_elem.parent
|
||||
|
|
@ -418,7 +423,7 @@ function M.parse_tokens_into_document( TOKENS )
|
|||
name = token.value:match("([%w-]+)")
|
||||
|
||||
if name == nil then
|
||||
error("Unrecognised word: " .. name)
|
||||
error("Unrecognised word: " .. tostring(name) .. " (Token ".. tostring(i) .." , type=" .. tostring(token.type) .. ", value=" .. tostring(token.value) .. ")")
|
||||
end
|
||||
|
||||
current_doc_element.attributes[name] = true
|
||||
|
|
@ -523,7 +528,7 @@ function M.clean_text_nodes(node)
|
|||
end
|
||||
|
||||
|
||||
function M.tostring(node, indent, include_internal_pseudoelements)
|
||||
function M._tostring(node, indent, include_internal_pseudoelements)
|
||||
-- Default indentation is 0 (root level)
|
||||
indent = indent or 0
|
||||
include_internal_pseudoelements = include_internal_pseudoelements or false
|
||||
|
|
@ -578,8 +583,6 @@ function M.tostring(node, indent, include_internal_pseudoelements)
|
|||
node_name = node_name .. ">"
|
||||
end
|
||||
|
||||
--print( node_name )
|
||||
|
||||
local next_indent = indent + 1
|
||||
if is_pseudo_element and not include_internal_pseudoelements then
|
||||
next_indent = indent
|
||||
|
|
@ -587,7 +590,7 @@ function M.tostring(node, indent, include_internal_pseudoelements)
|
|||
|
||||
-- Recursively print children
|
||||
for _, child in ipairs(node.children) do
|
||||
node_name = node_name .. M.tostring(child, next_indent, include_internal_pseudoelements)
|
||||
node_name = node_name .. M._tostring(child, next_indent, include_internal_pseudoelements)
|
||||
end
|
||||
|
||||
if not VOID_TAGS[node.tag_name] and ( not is_pseudo_element or include_internal_pseudoelements ) then
|
||||
|
|
@ -603,7 +606,9 @@ function M.tostring(node, indent, include_internal_pseudoelements)
|
|||
return node_name
|
||||
end
|
||||
|
||||
|
||||
--- Serialise a node to text and strip the surrounding whitespace.
-- Delegates the actual rendering to M._tostring, then trims the result so the
-- returned string has no leading or trailing whitespace.
-- @param node the node to serialise
-- @param base_indent indentation level forwarded to M._tostring
-- @param include_internal_pseudoelements forwarded unchanged to M._tostring
-- @return the trimmed serialisation
function M.tostring(node, base_indent, include_internal_pseudoelements)
    local rendered = M._tostring(node, base_indent, include_internal_pseudoelements)
    return trim(rendered)
end
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,32 @@
|
|||
|
||||
-- Logger with deferred stderr output.
-- Messages passed to printerr are held in errors_buffer until
-- enable_printing_errors is called; from then on they go straight to stderr.
-- If enable_printing_errors is never called, they are never written.
local may_print_errors = false
local errors_buffer = {}

local logger = {
    --- Print a line on stdout. A missing argument prints an empty line.
    -- @param str text to print (optional)
    print = function( str )
        print( str or "" )
    end,

    --- Report a message on stderr, making sure it ends with a newline.
    -- While printing is not enabled yet, the message is only buffered.
    -- @param str message to report; non-string values are converted with
    --            tostring instead of raising on the method call
    printerr = function( str )
        str = tostring( str or "" )
        if str:sub(-1) ~= "\n" then
            str = str .. "\n"
        end

        if not may_print_errors then
            errors_buffer[#errors_buffer + 1] = str
            return
        end

        io.stderr:write(str)
    end,

    --- Turn on stderr output and flush everything buffered so far.
    -- The buffer is emptied as it is flushed, so calling this more than once
    -- does not print the same messages again.
    enable_printing_errors = function()
        may_print_errors = true

        local pending = errors_buffer
        errors_buffer = {}

        for _, err in ipairs(pending) do
            io.stderr:write(err)
        end
    end,
}
|
||||
|
||||
|
||||
return logger
|
||||
156
main.lua
156
main.lua
|
|
@ -1,28 +1,140 @@
|
|||
#!/bin/env lua
|
||||
|
||||
local html = require(".html")
|
||||
local css = require(".css")
|
||||
local HTML = require(".html")
|
||||
local CSS = require(".css")
|
||||
|
||||
local logger = require(".logging")
|
||||
|
||||
|
||||
local file = io.open("small.html", "r")
|
||||
|
||||
if file == nil then
|
||||
error("File doesn't exist")
|
||||
|
||||
--- Tell whether a file can be opened for reading.
-- @param name path to probe
-- @return true when io.open succeeds on the path, false otherwise
local function file_exists(name)
    local handle = io.open(name, "r")
    if handle == nil then
        return false
    end

    -- Only probing: release the handle straight away.
    handle:close()
    return true
end
|
||||
|
||||
local content = file:read("a")
|
||||
|
||||
local doc = html.parse( content )
|
||||
|
||||
|
||||
print("Write a css selector:")
|
||||
local whole_selector = css.parse( io.read() )
|
||||
local current_selector = whole_selector
|
||||
--- Print the command-line help on stdout and terminate with exit status 1.
-- Callers report the reason through logger.printerr right before calling this
-- function, and until error printing is enabled those messages are only
-- buffered. They are flushed here first: os.exit would otherwise end the
-- process with the explanation still sitting in the buffer.
-- Note: this means a fatal usage error is shown even when --quiet was passed.
local function print_usage()
    logger.enable_printing_errors()

    logger.print("Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>")
    logger.print(" html_path_or_minus: Path to HTML file or '-' for stdin")
    logger.print(" css_selector: CSS selector to search for")
    logger.print()
    logger.print(" Flags:")
    logger.print(" -f, --first-only: return only the first match")
    logger.print(" -q, --quiet: Don't print warnings")
    os.exit(1)
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
local FLAGS = {
|
||||
FIRST_ONLY = {},
|
||||
NO_PRINT_ERRORS = {},
|
||||
}
|
||||
|
||||
local LONGHAND_FLAGS = {
|
||||
["first-only"] = FLAGS.FIRST_ONLY,
|
||||
["quiet"] = FLAGS.NO_PRINT_ERRORS
|
||||
}
|
||||
|
||||
local SHORTHAND_FLAGS = {
|
||||
["f"] = FLAGS.FIRST_ONLY,
|
||||
["q"] = FLAGS.NO_PRINT_ERRORS,
|
||||
}
|
||||
|
||||
|
||||
|
||||
if #arg < 2 then
|
||||
logger.printerr("Error: Not enough arguments")
|
||||
print_usage()
|
||||
return 1
|
||||
end
|
||||
|
||||
local flags = {}
|
||||
local positionals = {}
|
||||
|
||||
for _, argument in ipairs(arg) do
|
||||
if argument:match("^%-%w+$") then
|
||||
for letter in argument:sub(2):gmatch("(%w)") do
|
||||
if not SHORTHAND_FLAGS[letter] then
|
||||
logger.printerr("Unknown flag: -"..letter..".")
|
||||
print_usage()
|
||||
return 1
|
||||
end
|
||||
|
||||
local flag = SHORTHAND_FLAGS[letter]
|
||||
|
||||
if flags[flag] then
|
||||
logger.printerr("Warning: passed -" .. letter .. " flag already !")
|
||||
end
|
||||
|
||||
flags[flag] = true
|
||||
end
|
||||
elseif argument:match("^%-%-[%w%-]+$") then
|
||||
local flagname = argument:sub(3)
|
||||
if not LONGHAND_FLAGS[flagname] then
|
||||
logger.printerr("Unknown flag: --"..flagname..".")
|
||||
print_usage()
|
||||
return 1
|
||||
end
|
||||
|
||||
local flag = LONGHAND_FLAGS[flagname]
|
||||
|
||||
if flags[flag] then
|
||||
logger.printerr("Warning: passed --" .. flagname .. " flag already !")
|
||||
end
|
||||
|
||||
flags[flag] = true
|
||||
else
|
||||
table.insert( positionals, argument )
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
if not flags[ FLAGS.NO_PRINT_ERRORS ] then
|
||||
logger.enable_printing_errors()
|
||||
end
|
||||
|
||||
|
||||
-- Exactly two positional arguments are expected: the HTML path and the
-- selector. The earlier `#arg < 2` guard counts flags too, so an invocation
-- such as `-f -q` gets past it with no positionals at all; reject that here
-- instead of passing nil on to the file and selector handling below.
if #positionals < 2 then
    logger.printerr("Error: Not enough arguments")
    print_usage()
    return 1
end

if #positionals > 2 then
    logger.printerr("Error: too many arguments !")
    print_usage()
    return 1
end
|
||||
|
||||
local html_file = positionals[1]
|
||||
local html = nil
|
||||
|
||||
-- Load the HTML source: from the given path, or from stdin when the path is "-".
-- Failures terminate through os.exit(2): a value returned from the main chunk
-- is not used as the process exit status, so `return 2` would exit with 0.
if html_file ~= "-" then
    if not file_exists( html_file ) then
        logger.printerr("File doesn't exist: " .. html_file)
        os.exit(2)
    end

    local handle = io.open( html_file, "r" )
    if not handle then
        logger.printerr("Failed to open file " .. html_file)
        os.exit(2)
    end

    html = handle:read("a")
    -- The whole content is in memory now; release the handle.
    handle:close()
else
    -- io.read() without a format returns a single line only; the whole
    -- stream is needed to parse a complete document.
    html = io.read("a")
end
|
||||
|
||||
local document = HTML.parse( html )
|
||||
local css_selector = CSS.parse( positionals[2] )
|
||||
|
||||
|
||||
local current_selector = css_selector
|
||||
|
||||
|
||||
local elements = {}
|
||||
-- start with all elements matching the first selector
|
||||
doc:foreach(function( el )
|
||||
document:foreach(function( el )
|
||||
if el:check_simple_selector( current_selector.selector ) then
|
||||
table.insert( elements, el )
|
||||
end
|
||||
|
|
@ -33,7 +145,7 @@ while current_selector.combinator ~= nil do
|
|||
|
||||
local new_elements = {}
|
||||
|
||||
if current_selector.combinator == css.COMBINATORS.DESCENDANT then
|
||||
if current_selector.combinator == CSS.COMBINATORS.DESCENDANT then
|
||||
for _, element in ipairs( elements ) do
|
||||
element:foreach(function( el )
|
||||
if el:check_simple_selector( next_selector.selector ) then
|
||||
|
|
@ -45,7 +157,7 @@ while current_selector.combinator ~= nil do
|
|||
goto continue
|
||||
end
|
||||
|
||||
if current_selector.combinator == css.COMBINATORS.DIRECT_DESCENDANT then
|
||||
if current_selector.combinator == CSS.COMBINATORS.DIRECT_DESCENDANT then
|
||||
for _, element in ipairs( elements ) do
|
||||
for _, child in ipairs( element.children ) do
|
||||
if child:check_simple_selector( next_selector.selector ) then
|
||||
|
|
@ -57,7 +169,7 @@ while current_selector.combinator ~= nil do
|
|||
goto continue
|
||||
end
|
||||
|
||||
if current_selector.combinator == css.COMBINATORS.NEXT_SIBLING then
|
||||
if current_selector.combinator == CSS.COMBINATORS.NEXT_SIBLING then
|
||||
for _, element in ipairs( elements ) do
|
||||
local next_sibling = element:get_next_sibling()
|
||||
while next_sibling and next_sibling.tag_name == ":text" do
|
||||
|
|
@ -72,7 +184,7 @@ while current_selector.combinator ~= nil do
|
|||
goto continue
|
||||
end
|
||||
|
||||
if current_selector.combinator == css.COMBINATORS.SUBSEQUENT_SIBLING then
|
||||
if current_selector.combinator == CSS.COMBINATORS.SUBSEQUENT_SIBLING then
|
||||
for _, element in ipairs( elements ) do
|
||||
local sibling = element:get_next_sibling()
|
||||
while sibling ~= nil do
|
||||
|
|
@ -96,10 +208,14 @@ end
|
|||
|
||||
|
||||
|
||||
|
||||
for _, el in ipairs(elements) do
|
||||
print( html.tostring( el ) )
|
||||
if flags[FLAGS.FIRST_ONLY] then
|
||||
if #elements > 0 then
|
||||
logger.print( HTML.tostring( elements[1] ) )
|
||||
end
|
||||
|
||||
return 0
|
||||
end
|
||||
|
||||
|
||||
for _, el in ipairs(elements) do
|
||||
logger.print( HTML.tostring(el) )
|
||||
end
|
||||
|
|
|
|||
Loading…
Reference in New Issue