diff --git a/README.md b/README.md index 109e171..ee201f6 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,7 @@ Supported combinators are all the "basic" ones: ### TODO -- [ ] `--text` option to only get the text in the matched elements - [ ] Universal selector (`*` to match any element) -- [ ] Attribute selectors (`[attr="value"]`) - [ ] A way to "group" selectors, e.g. `aside {p, footer}` to select all `p`s and `footer`s in `aside`s ? ## Usage @@ -57,6 +55,8 @@ Where: * `-1`, `--first-only`: Return only the first match * `-e`, `--errors`: print warnings * `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/fr/docs/Web/API/HTMLElement/innerText) of the matched elements +* `-t`, `--text`: Print only the [innerText](https://developer.mozilla.org/fr/ +* `-a`, `--select-attribute`: Print the value of the attribute on matched elements. Supersedes -t. ## Motivation diff --git a/html.lua b/html.lua index b9e185b..d1fad0a 100644 --- a/html.lua +++ b/html.lua @@ -159,7 +159,7 @@ function M.make_dom_element( tag_name, parent_elem ) end return text - end + end, } if parent_elem then @@ -227,7 +227,7 @@ function M.tokenise( content ) end if content:sub(i, i+1) == "", i) + i = content:find(">", i, true) goto continue end @@ -259,7 +259,7 @@ function M.tokenise( content ) if RAW_TEXT_TAGS[word] then logger.printerr("Warning: "..word.." tags may contain text that would be incorrectly parsed as HTML.") -- made possible because of the whitespace removal at the start - i = content:find("" - i = content:find(">", i) + i = content:find(">", i, true) -- made possible because of the whitespace removal at the start - i = content:find(" ") - logger.print(" html_path_or_minus: Path to HTML file or '-' for stdin") - logger.print(" css_selector: CSS selector to search for") - logger.print() - logger.print(" Flags:") - logger.print(" -1, --first-only: return only the first match") - logger.print(" -e, --errors: print warnings") - logger.print(" -t, --text: Print only the innerText of the matched elements") + logger.print("Usage: lua main.lua [FLAGS] ") + logger.print(" html_path_or_minus: Path to HTML file or '-' for stdin") + logger.print(" css_selector: CSS selector to search for") + logger.print() + logger.print(" Flags:") + logger.print(" -1, --first-only: return only the first match") + logger.print(" -e, --errors: print warnings") + logger.print(" -t, --text: Print only the innerText of the matched elements") + logger.print(" -a, --select-attribute: Print the value of the attribute on matched elements. Supersedes -t.") end @@ -59,18 +60,26 @@ local FLAGS = { FIRST_ONLY = {}, DO_PRINT_ERRORS = {}, INNER_TEXT = {}, + SELECT_ATTRIBUTE = {} } local LONGHAND_FLAGS = { ["first-only"] = FLAGS.FIRST_ONLY, ["errors"] = FLAGS.DO_PRINT_ERRORS, ["text"] = FLAGS.INNER_TEXT, + ["select-attribute"] = FLAGS.SELECT_ATTRIBUTE, } local SHORTHAND_FLAGS = { ["1"] = FLAGS.FIRST_ONLY, ["e"] = FLAGS.DO_PRINT_ERRORS, ["t"] = FLAGS.INNER_TEXT, + ["a"] = FLAGS.SELECT_ATTRIBUTE, +} + + +local FLAG_NEEDS_VALUE = { + [FLAGS.SELECT_ATTRIBUTE] = true, } @@ -84,44 +93,103 @@ end local flags = {} local positionals = {} -for _, argument in ipairs(arg) do - if argument:match("^%-%w+$") then - for letter in argument:sub(2):gmatch("(%w)") do - if not SHORTHAND_FLAGS[letter] then - logger.printerr("Unknown flag: -"..letter..".") - print_usage() - os.exit( RETURN_CODES.ARGUMENTS_ERROR ) - end +local i = 1 +while i <= #arg do + local argument = arg[i] + -- Handle shorthand flags (-a, -1, etc.) + if argument:match("^%-%w+$") then + local flag_str = argument:sub(2) + + -- Handle single-letter flags + if #flag_str == 1 then + local letter = flag_str local flag = SHORTHAND_FLAGS[letter] - if flags[flag] then - logger.printerr("Warning: passed -" .. letter .. " flag already !") + if not flag then + logger.printerr("Unknown flag: -"..letter) + print_usage() + os.exit(RETURN_CODES.ARGUMENTS_ERROR) end - flags[flag] = true - end - elseif argument:match("^%-%-[%w%-]+$") then - local flagname = argument:sub(3) - if not LONGHAND_FLAGS[flagname] then - logger.printerr("Unknown flag: --"..flagname..".") - print_usage() - os.exit( RETURN_CODES.ARGUMENTS_ERROR ) + -- Handle flags that require values + if FLAG_NEEDS_VALUE[flag] then + if i == #arg then + logger.printerr("Flag -"..letter.." requires a value") + os.exit(RETURN_CODES.ARGUMENTS_ERROR) + end + flags[flag] = arg[i+1] + i = i + 2 -- Skip next argument as it's the value + else + -- Handle regular boolean flags + if flags[flag] then + logger.printerr("Warning: passed -"..letter.." flag already!") + end + flags[flag] = true + i = i + 1 + end + + else + -- Handle grouped flags (-abc) + for letter in flag_str:gmatch("(%w)") do + local flag = SHORTHAND_FLAGS[letter] + + if not flag then + logger.printerr("Unknown flag in group: -"..letter) + print_usage() + os.exit(RETURN_CODES.ARGUMENTS_ERROR) + end + + if FLAG_NEEDS_VALUE[flag] then + logger.printerr("Cannot use value-taking flags in groups: -"..letter) + os.exit(RETURN_CODES.ARGUMENTS_ERROR) + end + + if flags[flag] then + logger.printerr("Warning: passed -"..letter.." flag already!") + end + flags[flag] = true + end + i = i + 1 end + -- Handle long flags (--flag) + elseif argument:match("^%-%-") then + local flagname = argument:sub(3) local flag = LONGHAND_FLAGS[flagname] - if flags[flag] then - logger.printerr("Warning: passed --" .. flagname .. " flag already !") + if not flag then + logger.printerr("Unknown flag: --"..flagname) + print_usage() + os.exit(RETURN_CODES.ARGUMENTS_ERROR) + end + + -- Handle flags that require values + if FLAG_NEEDS_VALUE[flag] then + if i == #arg then + logger.printerr("Flag --"..flagname.." requires a value") + os.exit(RETURN_CODES.ARGUMENTS_ERROR) + end + flags[flag] = arg[i+1] + i = i + 2 -- Skip next argument as it's the value + else + -- Handle regular boolean flags + if flags[flag] then + logger.printerr("Warning: passed --"..flagname.." flag already!") + end + flags[flag] = true + i = i + 1 end - flags[flag] = true else - table.insert( positionals, argument ) + -- Handle positional arguments + table.insert(positionals, argument) + i = i + 1 end end + if flags[ FLAGS.DO_PRINT_ERRORS ] then logger.enable_printing_errors() end @@ -201,8 +269,8 @@ while current_selector.combinator ~= nil do for _, element in ipairs( elements ) do local next_sibling = element:get_next_sibling() while next_sibling and next_sibling.tag_name == ":text" do - next_sibling = next_sibling:get_next_sibling() - end + next_sibling = next_sibling:get_next_sibling() + end if next_sibling and next_sibling:check_simple_selector( next_selector.selector ) then table.insert( new_elements, next_sibling ) @@ -238,24 +306,51 @@ if #elements == 0 then os.exit( RETURN_CODES.NOTHING_FOUND ) end - +local MAX_NUMBER_OF_ELEMENTS_TO_SHOW = #elements if flags[FLAGS.FIRST_ONLY] then - if #elements > 0 then - if flags[FLAGS.INNER_TEXT] then - logger.print( elements[1]:inner_text() ) - os.exit( RETURN_CODES.OK ) - end + MAX_NUMBER_OF_ELEMENTS_TO_SHOW = 1 +end - logger.print( HTML.tostring( elements[1] ) ) + + + + +local attr = flags[FLAGS.SELECT_ATTRIBUTE] +if attr then + local spoof_nil = {} + local attrs = {} + + local i = 1 + while i <= MAX_NUMBER_OF_ELEMENTS_TO_SHOW do + local el = elements[i] + + local attribute_value = el.attributes[attr] + + table.insert( attrs, attribute_value or spoof_nil ) + + i = i+1 + end + + local nb_non_nil_values = 0 + for _, val in ipairs(attrs) do + if val ~= spoof_nil then + nb_non_nil_values = nb_non_nil_values + 1 + end + end + + if nb_non_nil_values == 0 then + os.exit( RETURN_CODES.NOTHING_FOUND ) + end + + for _, val in ipairs(attrs) do + if val ~= spoof_nil then + print(val) + else + print() + end end os.exit( RETURN_CODES.OK ) end -for _, el in ipairs(elements) do - if flags[FLAGS.INNER_TEXT] then - logger.print( el:inner_text() ) - else - logger.print( HTML.tostring(el) ) - end end