Compare commits
	
		
			No commits in common. "b94a21cb2edd6a3b5bf323e2ece748b822798932" and "763484013c13518372fc79d1fedcd638601fd790" have entirely different histories. 
		
	
	
		
			b94a21cb2e
			...
			763484013c
		
	
		| 
						 | 
				
			
			@ -2,5 +2,3 @@
 | 
			
		|||
*.luastatic.c
 | 
			
		||||
# Compiled executable from main.lua
 | 
			
		||||
main
 | 
			
		||||
# Same but with correct name
 | 
			
		||||
htmlq
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										41
									
								
								README.md
								
								
								
								
							
							
						
						
									
										41
									
								
								README.md
								
								
								
								
							| 
						 | 
				
			
			@ -41,23 +41,16 @@ Supported combinators are all the "basic" ones:
 | 
			
		|||
 | 
			
		||||
## Usage
 | 
			
		||||
 | 
			
		||||
Once compiled, you can run Htmlq using the following command:
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
 | 
			
		||||
Usage: lua main.lua [FLAGS] <html_path_or_minus> <css_selector>
 | 
			
		||||
  html_path_or_minus: Path to HTML file or '-' for stdin
 | 
			
		||||
  css_selector: CSS selector to search for
 | 
			
		||||
 | 
			
		||||
  Flags:
 | 
			
		||||
  -f, --first-only: return only the first match
 | 
			
		||||
  -q, --quiet: Don't print warnings
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Where:
 | 
			
		||||
 | 
			
		||||
*   `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
 | 
			
		||||
*   `<css_selector>` is the CSS selector you want to use to query the HTML.
 | 
			
		||||
 | 
			
		||||
### Flags
 | 
			
		||||
 | 
			
		||||
*   `-1`, `--first-only`: Return only the first match
 | 
			
		||||
*   `-q`, `--quiet`: Don't print warnings
 | 
			
		||||
*   `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/fr/docs/Web/API/HTMLElement/innerText) of the matched elements
 | 
			
		||||
 | 
			
		||||
## Motivation
 | 
			
		||||
 | 
			
		||||
I needed this for a specific need of mine, where I wanted to systematically extract the HTML starting with an element with a certain id, up to the closing tag. While I could probably have hacked something together for this one-time use case, in typical programmer spirit, I decided to create a tool.
 | 
			
		||||
| 
						 | 
				
			
			@ -85,7 +78,25 @@ luarocks install luastatic
 | 
			
		|||
Once `luastatic` is installed, you can compile Htmlq by running the following command in your terminal, from the project's root directory:
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so -o htmlq
 | 
			
		||||
luastatic main.lua css.lua html.lua logging.lua /usr/lib/liblua5.4.so
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Note that all `.lua` files from the project need to be specified, with `main.lua` as the first one. Also, the path to `liblua` may vary according to your system. The example provided is for an installation on EndeavourOS.
 | 
			
		||||
 | 
			
		||||
## Running
 | 
			
		||||
 | 
			
		||||
Once compiled, you can run Htmlq using the following command:
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
./htmlq [FLAGS] <html_path_or_minus> <css_selector>
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Where:
 | 
			
		||||
 | 
			
		||||
*   `<html_path_or_minus>` is the path to the HTML file you want to parse, or `-` to read from stdin.
 | 
			
		||||
*   `<css_selector>` is the CSS selector you want to use to query the HTML.
 | 
			
		||||
 | 
			
		||||
### Flags
 | 
			
		||||
 | 
			
		||||
*   `-f`, `--first-only`: Return only the first match
 | 
			
		||||
*   `-q`, `--quiet`: Don't print warnings
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										34
									
								
								css.lua
								
								
								
								
							
							
						
						
									
										34
									
								
								css.lua
								
								
								
								
							| 
						 | 
				
			
			@ -66,8 +66,7 @@ local function parse_compound_selector( tokeniser )
 | 
			
		|||
		tag_name = nil,
 | 
			
		||||
		id = nil,
 | 
			
		||||
		class = {},
 | 
			
		||||
		attributes_values = {},
 | 
			
		||||
		attributes_present = {},
 | 
			
		||||
		attributes = {},
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	--local selectors = {}
 | 
			
		||||
| 
						 | 
				
			
			@ -107,37 +106,6 @@ local function parse_compound_selector( tokeniser )
 | 
			
		|||
			end
 | 
			
		||||
			--table.insert(selectors, {type = "id", value = name})
 | 
			
		||||
			selector.id = name
 | 
			
		||||
		elseif char == "[" then
 | 
			
		||||
			tokeniser.next() -- consume leading [
 | 
			
		||||
 | 
			
		||||
			local name = tokeniser.read_identifier()
 | 
			
		||||
 | 
			
		||||
			if tokeniser.peek() == "=" then
 | 
			
		||||
				tokeniser.next()
 | 
			
		||||
 | 
			
		||||
				if tokeniser.peek() ~= "\"" then
 | 
			
		||||
					error("Expected opening quote \" at pos " .. tokeniser.pos() )
 | 
			
		||||
				end
 | 
			
		||||
				tokeniser.next() -- consume leading "
 | 
			
		||||
 | 
			
		||||
				local value = ""
 | 
			
		||||
				while tokeniser.peek() ~= "\"" do
 | 
			
		||||
					value = value .. tokeniser.peek()
 | 
			
		||||
					tokeniser.next()
 | 
			
		||||
				end
 | 
			
		||||
 | 
			
		||||
				tokeniser.next() -- consume trailing "
 | 
			
		||||
 | 
			
		||||
				selector.attributes_values[name] = value
 | 
			
		||||
			else
 | 
			
		||||
				table.insert( selector.attributes_present, name )
 | 
			
		||||
			end
 | 
			
		||||
 | 
			
		||||
			if tokeniser.peek() ~= "]" then
 | 
			
		||||
				error("Expected closing bracket (']') at " .. tokeniser.pos())
 | 
			
		||||
			end
 | 
			
		||||
 | 
			
		||||
			tokeniser.next() -- consume trailing ]
 | 
			
		||||
		else
 | 
			
		||||
			break
 | 
			
		||||
		end
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										89
									
								
								html.lua
								
								
								
								
							
							
						
						
									
										89
									
								
								html.lua
								
								
								
								
							| 
						 | 
				
			
			@ -39,62 +39,6 @@ local VOID_TAGS = {
 | 
			
		|||
	wbr = true,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
local INLINE_TAGS = {
 | 
			
		||||
	-- Text formatting
 | 
			
		||||
	a = true,
 | 
			
		||||
	abbr = true,
 | 
			
		||||
	b = true,
 | 
			
		||||
	bdi = true,
 | 
			
		||||
	bdo = true,
 | 
			
		||||
	cite = true,
 | 
			
		||||
	code = true,
 | 
			
		||||
	data = true,
 | 
			
		||||
	dfn = true,
 | 
			
		||||
	em = true,
 | 
			
		||||
	i = true,
 | 
			
		||||
	kbd = true,
 | 
			
		||||
	mark = true,
 | 
			
		||||
	q = true,
 | 
			
		||||
	ruby = true,
 | 
			
		||||
	s = true,
 | 
			
		||||
	samp = true,
 | 
			
		||||
	small = true,
 | 
			
		||||
	span = true,
 | 
			
		||||
	strong = true,
 | 
			
		||||
	sub = true,
 | 
			
		||||
	sup = true,
 | 
			
		||||
	time = true,
 | 
			
		||||
	u = true,
 | 
			
		||||
	var = true,
 | 
			
		||||
 | 
			
		||||
	-- Interactive elements
 | 
			
		||||
	button = true,
 | 
			
		||||
	label = true,
 | 
			
		||||
	select = true,
 | 
			
		||||
	textarea = true,
 | 
			
		||||
 | 
			
		||||
	-- Media/content
 | 
			
		||||
	img = true,
 | 
			
		||||
	picture = true,
 | 
			
		||||
	map = true,
 | 
			
		||||
	object = true,
 | 
			
		||||
 | 
			
		||||
	-- Line break
 | 
			
		||||
	br = true,
 | 
			
		||||
	wbr = true,
 | 
			
		||||
 | 
			
		||||
	-- Forms
 | 
			
		||||
	input = true,
 | 
			
		||||
	output = true,
 | 
			
		||||
	progress = true,
 | 
			
		||||
	meter = true,
 | 
			
		||||
 | 
			
		||||
	-- Scripting
 | 
			
		||||
	script = true,
 | 
			
		||||
	noscript = true,
 | 
			
		||||
	template = true,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
function M.make_dom_element( tag_name, parent_elem )
 | 
			
		||||
	local o = {
 | 
			
		||||
| 
						 | 
				
			
			@ -142,23 +86,6 @@ function M.make_dom_element( tag_name, parent_elem )
 | 
			
		|||
			for _, child in ipairs(self.children or {}) do
 | 
			
		||||
				child:foreach( fn )
 | 
			
		||||
			end
 | 
			
		||||
		end,
 | 
			
		||||
 | 
			
		||||
		inner_text = function(self)
 | 
			
		||||
			if self.tag_name == ":text" then
 | 
			
		||||
				return self.content
 | 
			
		||||
			end
 | 
			
		||||
 | 
			
		||||
			local text = ""
 | 
			
		||||
			for _, child in ipairs(self.children) do
 | 
			
		||||
				text = text .. child:inner_text()
 | 
			
		||||
 | 
			
		||||
				if not INLINE_TAGS[child.tag_name] then
 | 
			
		||||
					text = text .. "\n"
 | 
			
		||||
				end
 | 
			
		||||
			end
 | 
			
		||||
 | 
			
		||||
			return text
 | 
			
		||||
		end
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -422,20 +349,6 @@ function M.check_simple_selector(element, selector)
 | 
			
		|||
			end
 | 
			
		||||
		end
 | 
			
		||||
 | 
			
		||||
	for attr_name, attr_value in pairs(selector.attributes_values) do
 | 
			
		||||
		local elem_attr_value = element.attributes[attr_name]
 | 
			
		||||
		if elem_attr_value ~= attr_value then
 | 
			
		||||
			return false
 | 
			
		||||
		end
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
	-- Check attribute presence selectors
 | 
			
		||||
	for _, attr_name in ipairs(selector.attributes_present) do
 | 
			
		||||
		if not element.attributes[attr_name] then
 | 
			
		||||
			return false
 | 
			
		||||
		end
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
		return true
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -611,7 +524,7 @@ function M.clean_text_nodes(node)
 | 
			
		|||
		return
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
	node.content = node.content:gsub("%s+", " ")
 | 
			
		||||
	node.content = trim( node.content:gsub("%s+", " ") )
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										28
									
								
								main.lua
								
								
								
								
							
							
						
						
									
										28
									
								
								main.lua
								
								
								
								
							| 
						 | 
				
			
			@ -21,9 +21,8 @@ local function print_usage()
 | 
			
		|||
    logger.print("  css_selector: CSS selector to search for")
 | 
			
		||||
		logger.print()
 | 
			
		||||
		logger.print("  Flags:")
 | 
			
		||||
    logger.print("  -1, --first-only: return only the first match")
 | 
			
		||||
    logger.print("  -f, --first-only: return only the first match")
 | 
			
		||||
    logger.print("  -q, --quiet: Don't print warnings")
 | 
			
		||||
    logger.print("  -t, --text: Print only the innerText of the matched elements")
 | 
			
		||||
    os.exit(1)
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -33,20 +32,17 @@ end
 | 
			
		|||
 | 
			
		||||
local FLAGS = {
 | 
			
		||||
	FIRST_ONLY = {},
 | 
			
		||||
	DO_PRINT_ERRORS = {},
 | 
			
		||||
	INNER_TEXT = {},
 | 
			
		||||
	NO_PRINT_ERRORS = {},
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
local LONGHAND_FLAGS = {
 | 
			
		||||
	["first-only"] = FLAGS.FIRST_ONLY,
 | 
			
		||||
	["errors"] = FLAGS.DO_PRINT_ERRORS,
 | 
			
		||||
	["text"] = FLAGS.INNER_TEXT,
 | 
			
		||||
	["quiet"] = FLAGS.NO_PRINT_ERRORS
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
local SHORTHAND_FLAGS = {
 | 
			
		||||
	["1"] = FLAGS.FIRST_ONLY,
 | 
			
		||||
	["e"] = FLAGS.DO_PRINT_ERRORS,
 | 
			
		||||
	["t"] = FLAGS.INNER_TEXT,
 | 
			
		||||
	["f"] = FLAGS.FIRST_ONLY,
 | 
			
		||||
	["q"] = FLAGS.NO_PRINT_ERRORS,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -98,7 +94,7 @@ for _, argument in ipairs(arg) do
 | 
			
		|||
end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if flags[ FLAGS.DO_PRINT_ERRORS ] then
 | 
			
		||||
if not flags[ FLAGS.NO_PRINT_ERRORS ] then
 | 
			
		||||
	logger.enable_printing_errors()
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -126,7 +122,7 @@ if html_file ~= "-" then
 | 
			
		|||
 | 
			
		||||
	html = handle:read("a")
 | 
			
		||||
else
 | 
			
		||||
	html = io.read("a")
 | 
			
		||||
	html = io.read()
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
local document = HTML.parse( html )
 | 
			
		||||
| 
						 | 
				
			
			@ -214,12 +210,6 @@ end
 | 
			
		|||
 | 
			
		||||
if flags[FLAGS.FIRST_ONLY] then
 | 
			
		||||
	if #elements > 0 then
 | 
			
		||||
 | 
			
		||||
		if flags[FLAGS.INNER_TEXT] then
 | 
			
		||||
			logger.print( elements[1]:inner_text() )
 | 
			
		||||
			return 0
 | 
			
		||||
		end
 | 
			
		||||
 | 
			
		||||
		logger.print( HTML.tostring( elements[1] ) )
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -227,9 +217,5 @@ if flags[FLAGS.FIRST_ONLY] then
 | 
			
		|||
end
 | 
			
		||||
 | 
			
		||||
for _, el in ipairs(elements) do
 | 
			
		||||
		if flags[FLAGS.INNER_TEXT] then
 | 
			
		||||
			logger.print( el:inner_text() )
 | 
			
		||||
		else
 | 
			
		||||
		logger.print( HTML.tostring(el) )
 | 
			
		||||
end
 | 
			
		||||
end
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue