Compare commits

..

7 Commits

5 changed files with 290 additions and 32 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
# Luastatic-generated file
*.luastatic.c
# Compiled executable from main.lua
main

101
README.md Normal file
View File

@ -0,0 +1,101 @@
# Htmlq
## Overview
> In short: [jq](https://jqlang.github.io/jq/), but for HTML
This project is a from-scratch implementation of an HTML and CSS parser, written entirely in Lua. No external dependencies. It's designed to take HTML and CSS as input and provide a way to query the Document Object Model (DOM) using CSS selectors.
## Features
There's really only one feature: it takes in HTML and a CSS selector, and returns whatever is matched by that selector in the DOM.
Supported simple selectors:
* **tag name** - `h1`
* **class** - `.class`
* **id** - `#id`
Any _compound_ selector is supported as well (like `p.text-center.bold`, which matches all `p`s that have both the `text-center` and `bold` classes).
Supported combinators are all the "basic" ones:
* ` ` - the [descendant combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#descendant_combinator)
* `>` - the [child combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#child_combinator)
* `+` - the [next sibling combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#next-sibling_combinator)
* `~` - the [subsequent sibling combinator](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Combinators#subsequent-sibling_combinator)
### Limitations
* The [column](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors/Selectors_and_combinators#column_combinator) and [namespace](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors/Selectors_and_combinators#namespace_separator) combinators are **not** supported
* **Here be dragons**: This tool was written by someone who is not especially good at writing parsers; it may break or behave unexpectedly. Don't hesitate to report issues!
* This tool was not designed with speed in mind; it seems _fast enough_ for common CLI usage.
### TODO
- [ ] Universal selector (`*` to match any element)
- [ ] Attribute selectors (`[attr="value"]`)
- [ ] A way to "group" selectors, e.g. `aside {p, footer}` to select all `p`s and `footer`s in `aside`s?
## Usage
```
Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>
html_path_or_minus: Path to HTML file or '-' for stdin
css_selector: CSS selector to search for
Flags:
-f, --first-only: return only the first match
-q, --quiet: Don't print warnings
```
## Motivation
I wrote this to scratch a specific itch: I wanted to systematically extract the HTML starting at an element with a certain id, up to its closing tag. While I could probably have hacked something together for this one-time use case, in typical programmer spirit, I decided to create a tool.
This is my first parser, and it was very fun!
Writing a parser seems to be a kind of "rite of passage" for programmers, and now I did it too.
Obviously, this could have been solved with `jsdom` and like 10 lines of JS.
Plus, it's kinda neat to have a lightweight, dependency-free way to mess with web stuff in Lua.
## Installation
Htmlq is written in Lua and requires no external dependencies. To use it, you will need to have Lua installed on your system. You can check if Lua is installed by running `lua -v` in your terminal. If Lua is not installed, you can install it from your distribution's package manager or from the official Lua website.
## Compiling
To compile Htmlq, you will need to use `luastatic`. You can install `luastatic` via `luarocks` by running the following command:
```
luarocks install luastatic
```
Once `luastatic` is installed, you can compile Htmlq by running the following command in your terminal, from the project's root directory:
```
luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so
```
Note that all `.lua` files from the project need to be specified, with `main.lua` as the first one. Also, the path to `liblua` may vary according to your system. The example provided is for an installation on EndeavourOS.
## Running
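Once compiled, you can run Htmlq using the following command (note that `luastatic` names the executable after the first Lua file, i.e. `main`; rename it to `htmlq`, or replace `./htmlq` with `./main` below):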
```
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
```
Where:
* `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
* `<css_selector>` is the CSS selector you want to use to query the HTML.
### Flags
* `-f`, `--first-only`: Return only the first match
* `-q`, `--quiet`: Don't print warnings

View File

@ -1,3 +1,4 @@
local logger = require(".logging")
local function trim(str)
return str:match("^%s*(.-)%s*$")
@ -232,13 +233,15 @@ function M.tokenise( content )
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG") then
if RAW_TEXT_TAGS[word] then
print("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
-- made possible because of the whitespace removal at the start
set_skipping_to("</" .. word)
end
end
table.insert( TOKENS, {type="WORD", value=word})
if not word:match("^%s*$") then
table.insert( TOKENS, {type="WORD", value=word})
end
else
table.insert( TOKENS, {type="TEXT", value=text_memory} )
end
@ -261,7 +264,7 @@ function M.tokenise( content )
if TOKENS[#TOKENS] and ( TOKENS[#TOKENS].type == "START_OPENING_TAG" ) then
if RAW_TEXT_TAGS[word] then
print("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.")
-- made possible because of the whitespace removal at the start
set_skipping_to("</" .. word)
text_memory = ""
@ -271,8 +274,10 @@ function M.tokenise( content )
end
end
table.insert( TOKENS, {type="WORD", value=word})
text_memory = ""
if not word:match("^%s*$") then
table.insert( TOKENS, {type="WORD", value=word})
text_memory = ""
end
goto continue
end
@ -397,7 +402,7 @@ function M.parse_tokens_into_document( TOKENS )
if curr_elem.parent == nil then
-- reached DOCUMENT root
print("Warning: reached document root while trying to match for closing " .. token.value .. " token.")
logger.printerr("Warning: reached document root while trying to match for closing " .. token.value .. " token.")
current_doc_element = DOCUMENT
else
current_doc_element = curr_elem.parent
@ -418,7 +423,7 @@ function M.parse_tokens_into_document( TOKENS )
name = token.value:match("([%w-]+)")
if name == nil then
error("Unrecognised word: " .. name)
error("Unrecognised word: " .. tostring(name) .. " (Token ".. tostring(i) .." , type=" .. tostring(token.type) .. ", value=" .. tostring(token.value) .. ")")
end
current_doc_element.attributes[name] = true
@ -523,7 +528,7 @@ function M.clean_text_nodes(node)
end
function M.tostring(node, indent, include_internal_pseudoelements)
function M._tostring(node, indent, include_internal_pseudoelements)
-- Default indentation is 0 (root level)
indent = indent or 0
include_internal_pseudoelements = include_internal_pseudoelements or false
@ -578,8 +583,6 @@ function M.tostring(node, indent, include_internal_pseudoelements)
node_name = node_name .. ">"
end
--print( node_name )
local next_indent = indent + 1
if is_pseudo_element and not include_internal_pseudoelements then
next_indent = indent
@ -587,7 +590,7 @@ function M.tostring(node, indent, include_internal_pseudoelements)
-- Recursively print children
for _, child in ipairs(node.children) do
node_name = node_name .. M.tostring(child, next_indent, include_internal_pseudoelements)
node_name = node_name .. M._tostring(child, next_indent, include_internal_pseudoelements)
end
if not VOID_TAGS[node.tag_name] and ( not is_pseudo_element or include_internal_pseudoelements ) then
@ -603,7 +606,9 @@ function M.tostring(node, indent, include_internal_pseudoelements)
return node_name
end
--- Render a node and its children as text, with surrounding whitespace removed.
-- Public entry point: the recursive work is done by M._tostring, and the
-- result is passed through trim() so the output has no leading or trailing
-- whitespace.
-- @param node the node to render
-- @param base_indent indentation level forwarded to M._tostring
--        (M._tostring falls back to 0 when this is nil)
-- @param include_internal_pseudoelements forwarded to M._tostring
--        (M._tostring falls back to false when this is nil)
-- @return the trimmed textual rendering of the node
function M.tostring(node, base_indent, include_internal_pseudoelements)
    local rendered = M._tostring(node, base_indent, include_internal_pseudoelements)
    return trim(rendered)
end

32
logging.lua Normal file
View File

@ -0,0 +1,32 @@
--- Minimal logging helper.
-- Regular output goes to stdout. Diagnostics go to stderr, but are held in
-- memory until the caller decides they may be shown (see
-- logger.enable_printing_errors), so that a "quiet" mode can be selected
-- after some diagnostics have already been produced.

-- Whether diagnostics may be written to stderr right away.
local may_print_errors = false
-- Diagnostics received while printing was still disabled, in arrival order.
local errors_buffer = {}

local logger = {}

--- Print a line to stdout.
-- @param str text to print; nil prints an empty line
function logger.print(str)
    print(str or "")
end

--- Emit a diagnostic line on stderr, or buffer it if printing is not enabled yet.
-- A trailing newline is appended when the message does not already end with one.
-- @param str the message; nil/false is treated as an empty message, and
--        non-string values are converted with tostring() instead of raising
function logger.printerr(str)
    str = tostring(str or "")
    if str:sub(-1) ~= "\n" then
        str = str .. "\n"
    end
    if may_print_errors then
        io.stderr:write(str)
    else
        errors_buffer[#errors_buffer + 1] = str
    end
end

--- Allow diagnostics to reach stderr and flush everything buffered so far.
-- The buffer is emptied once flushed, so calling this more than once never
-- prints the same message twice.
function logger.enable_printing_errors()
    may_print_errors = true
    local pending = errors_buffer
    errors_buffer = {}
    for _, err in ipairs(pending) do
        io.stderr:write(err)
    end
end

return logger

156
main.lua
View File

@ -1,28 +1,140 @@
#!/bin/env lua
local html = require(".html")
local css = require(".css")
local HTML = require(".html")
local CSS = require(".css")
local logger = require(".logging")
local file = io.open("small.html", "r")
if file == nil then
error("File doesn't exist")
--- Tell whether a file can be opened for reading.
-- @param name path of the file to probe
-- @return true when io.open succeeds in read mode (the handle is closed
--         again immediately), false otherwise
local function file_exists(name)
    local handle = io.open(name, "r")
    if handle == nil then
        return false
    end
    io.close(handle)
    return true
end
local content = file:read("a")
local doc = html.parse( content )
print("Write a css selector:")
local whole_selector = css.parse( io.read() )
local current_selector = whole_selector
--- Print the command-line help on stdout, then terminate the process.
-- This function does not return: it ends with os.exit(1).
local function print_usage()
    local usage_lines = {
        "Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>",
        " html_path_or_minus: Path to HTML file or '-' for stdin",
        " css_selector: CSS selector to search for",
        "", -- blank separator line before the flags section
        " Flags:",
        " -f, --first-only: return only the first match",
        " -q, --quiet: Don't print warnings",
    }
    for i = 1, #usage_lines do
        logger.print(usage_lines[i])
    end
    os.exit(1)
end
-- Each flag is represented by a unique empty table. Tables compare by
-- reference, so these values act as distinct sentinel keys for the set of
-- flags collected while parsing the command line.
local FLAGS = {
    FIRST_ONLY = {},
    NO_PRINT_ERRORS = {},
}

-- Long option name (the part after "--") -> flag sentinel.
local LONGHAND_FLAGS = {
    ["first-only"] = FLAGS.FIRST_ONLY,
    quiet = FLAGS.NO_PRINT_ERRORS,
}

-- Single-letter option (the part after "-") -> flag sentinel.
local SHORTHAND_FLAGS = {
    f = FLAGS.FIRST_ONLY,
    q = FLAGS.NO_PRINT_ERRORS,
}
-- Report a fatal command-line error, show the usage text and exit.
-- The message is written straight to stderr rather than through
-- logger.printerr: the logger buffers diagnostics until
-- logger.enable_printing_errors() is called, and print_usage() terminates the
-- process, so a buffered message would never be displayed.
local function usage_error(message)
    io.stderr:write(message .. "\n")
    print_usage()
end

if #arg < 2 then
    usage_error("Error: Not enough arguments")
    return 1
end

-- Set of flag sentinels (see FLAGS) that were passed on the command line.
local flags = {}
-- Non-flag arguments, in order: the HTML path (or "-") and the CSS selector.
local positionals = {}

for _, argument in ipairs(arg) do
    if argument:match("^%-%w+$") then
        -- One or more bundled shorthand flags, e.g. "-f" or "-fq".
        -- A lone "-" does not match and is kept as a positional (stdin).
        for letter in argument:sub(2):gmatch("%w") do
            local flag = SHORTHAND_FLAGS[letter]
            if not flag then
                usage_error("Unknown flag: -"..letter..".")
                return 1
            end

            if flags[flag] then
                logger.printerr("Warning: passed -" .. letter .. " flag already !")
            end
            flags[flag] = true
        end
    elseif argument:match("^%-%-[%w%-]+$") then
        -- Longhand flag, e.g. "--first-only".
        local flagname = argument:sub(3)
        local flag = LONGHAND_FLAGS[flagname]
        if not flag then
            usage_error("Unknown flag: --"..flagname..".")
            return 1
        end

        if flags[flag] then
            logger.printerr("Warning: passed --" .. flagname .. " flag already !")
        end
        flags[flag] = true
    else
        table.insert( positionals, argument )
    end
end

-- Warnings buffered so far are flushed here unless --quiet was requested.
if not flags[ FLAGS.NO_PRINT_ERRORS ] then
    logger.enable_printing_errors()
end

-- The early #arg check cannot tell flags from positionals (e.g. "-f -q"),
-- so the positional count is validated again now that flags are stripped.
if #positionals < 2 then
    usage_error("Error: Not enough arguments")
    return 1
end

if #positionals > 2 then
    usage_error("Error: too many arguments !")
    return 1
end
-- Load the HTML to query: either from the file named by the first positional
-- argument, or from stdin when that argument is "-".
local html_file = positionals[1]
local html = nil
if html_file ~= "-" then
    -- Fatal errors are written directly to stderr (so --quiet cannot hide
    -- them) and terminate with os.exit: a value returned from the main chunk
    -- is not used as the process exit status by the standalone interpreter.
    if not( file_exists( html_file )) then
        io.stderr:write("File doesn't exist: " .. html_file .. "\n")
        os.exit(2)
    end

    local handle = io.open( html_file, "r" )
    if not handle then
        io.stderr:write("Failed to open file " .. html_file .. "\n")
        os.exit(2)
    end

    html = handle:read("a")
    handle:close()
else
    -- "a" reads all of stdin; io.read() without a format would only return
    -- the first line of the piped document.
    html = io.read("a")
end
local document = HTML.parse( html )
local css_selector = CSS.parse( positionals[2] )
local current_selector = css_selector
local elements = {}
-- start with all elements matching the first selector
doc:foreach(function( el )
document:foreach(function( el )
if el:check_simple_selector( current_selector.selector ) then
table.insert( elements, el )
end
@ -33,7 +145,7 @@ while current_selector.combinator ~= nil do
local new_elements = {}
if current_selector.combinator == css.COMBINATORS.DESCENDANT then
if current_selector.combinator == CSS.COMBINATORS.DESCENDANT then
for _, element in ipairs( elements ) do
element:foreach(function( el )
if el:check_simple_selector( next_selector.selector ) then
@ -45,7 +157,7 @@ while current_selector.combinator ~= nil do
goto continue
end
if current_selector.combinator == css.COMBINATORS.DIRECT_DESCENDANT then
if current_selector.combinator == CSS.COMBINATORS.DIRECT_DESCENDANT then
for _, element in ipairs( elements ) do
for _, child in ipairs( element.children ) do
if child:check_simple_selector( next_selector.selector ) then
@ -57,7 +169,7 @@ while current_selector.combinator ~= nil do
goto continue
end
if current_selector.combinator == css.COMBINATORS.NEXT_SIBLING then
if current_selector.combinator == CSS.COMBINATORS.NEXT_SIBLING then
for _, element in ipairs( elements ) do
local next_sibling = element:get_next_sibling()
while next_sibling and next_sibling.tag_name == ":text" do
@ -72,7 +184,7 @@ while current_selector.combinator ~= nil do
goto continue
end
if current_selector.combinator == css.COMBINATORS.SUBSEQUENT_SIBLING then
if current_selector.combinator == CSS.COMBINATORS.SUBSEQUENT_SIBLING then
for _, element in ipairs( elements ) do
local sibling = element:get_next_sibling()
while sibling ~= nil do
@ -96,10 +208,14 @@ end
if flags[FLAGS.FIRST_ONLY] then
if #elements > 0 then
logger.print( HTML.tostring( elements[1] ) )
end
for _, el in ipairs(elements) do
print( html.tostring( el ) )
return 0
end
for _, el in ipairs(elements) do
logger.print( HTML.tostring(el) )
end