From 7285f020c0df280da8fa7d8658f987fa9be2e051 Mon Sep 17 00:00:00 2001 From: Aire-One Date: Wed, 21 Sep 2022 23:41:27 +0200 Subject: [PATCH 1/4] feat(scraper): implement Module_Info scraper --- src/awesomewm.d.tl/entities/Module_Info.lua | 10 +++++++ .../scraper/module_info_list.lua | 28 +++++++++++++++++++ src/awesomewm.d.tl/scraper/utils.lua | 25 +++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 src/awesomewm.d.tl/entities/Module_Info.lua create mode 100644 src/awesomewm.d.tl/scraper/module_info_list.lua create mode 100644 src/awesomewm.d.tl/scraper/utils.lua diff --git a/src/awesomewm.d.tl/entities/Module_Info.lua b/src/awesomewm.d.tl/entities/Module_Info.lua new file mode 100644 index 0000000..df163d9 --- /dev/null +++ b/src/awesomewm.d.tl/entities/Module_Info.lua @@ -0,0 +1,10 @@ +local class = require "pl.class" + +local Module_Info = class.Module_Info() + +function Module_Info:_init(name, uri) + self.name = name + self.uri = uri +end + +return Module_Info diff --git a/src/awesomewm.d.tl/scraper/module_info_list.lua b/src/awesomewm.d.tl/scraper/module_info_list.lua new file mode 100644 index 0000000..f2a8b5f --- /dev/null +++ b/src/awesomewm.d.tl/scraper/module_info_list.lua @@ -0,0 +1,28 @@ +local Module_Info = require "entities.Module_Info" +local scraper_utils = require "scraper.utils" +local utils = require "utils" + +local module = {} + +local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a" + +local function extract_module_info(node) + local name = utils.sanitize_string(node:inner_text()) + local uri = node.attr.href + + if not (name and uri) then + error("Can't extract module info from node: " .. 
node:outer_html()) + end + + return Module_Info(name, uri) +end + +function module.get_modules_from_index(html) + return scraper_utils.scrape( + html, + MODULE_A_TAG_QUERY_SELECTOR, + extract_module_info + ) +end + +return module diff --git a/src/awesomewm.d.tl/scraper/utils.lua b/src/awesomewm.d.tl/scraper/utils.lua new file mode 100644 index 0000000..04280ef --- /dev/null +++ b/src/awesomewm.d.tl/scraper/utils.lua @@ -0,0 +1,25 @@ +local log = require "logger" +local scanner = require "web_sanitize.query.scan_html" + +local scraper_utils = {} + +function scraper_utils.scrape(html, query_selector, extract_callback) + local ret = {} + + scanner.scan_html(html, function(stack) + if stack:is(query_selector) then + local node = stack:current() + local success, info = pcall(extract_callback, node) + + if not success then + log:error { message = info } + else + table.insert(ret, info) + end + end + end) + + return ret +end + +return scraper_utils -- 2.40.1 From ff446d3beabf79dcd97d8ded1684c39948e0e938 Mon Sep 17 00:00:00 2001 From: Aire-One Date: Thu, 29 Sep 2022 19:01:00 +0200 Subject: [PATCH 2/4] feat(scraper): implement basic Module_Doc scraper --- src/awesomewm.d.tl/entities/Function_Info.lua | 23 +++++ src/awesomewm.d.tl/entities/Module_Doc.lua | 12 +++ src/awesomewm.d.tl/scraper/module_doc.lua | 84 +++++++++++++++++++ src/awesomewm.d.tl/scraper/utils.lua | 43 ++++++++++ 4 files changed, 162 insertions(+) create mode 100644 src/awesomewm.d.tl/entities/Function_Info.lua create mode 100644 src/awesomewm.d.tl/entities/Module_Doc.lua create mode 100644 src/awesomewm.d.tl/scraper/module_doc.lua diff --git a/src/awesomewm.d.tl/entities/Function_Info.lua b/src/awesomewm.d.tl/entities/Function_Info.lua new file mode 100644 index 0000000..956ac11 --- /dev/null +++ b/src/awesomewm.d.tl/entities/Function_Info.lua @@ -0,0 +1,23 @@ +local class = require "pl.class" +local List = require "pl.List" + +local Function_Info = class.Function_Info() + +function Function_Info:_init() + 
self.name = "" + self.parameters = List() + self.return_types = List() +end + +function Function_Info:append_return_type(return_type) + self.return_types:append(return_type) +end + +function Function_Info:append_parameter(name, type) + self.parameters:append { + name = name, + type = type, + } +end + +return Function_Info diff --git a/src/awesomewm.d.tl/entities/Module_Doc.lua b/src/awesomewm.d.tl/entities/Module_Doc.lua new file mode 100644 index 0000000..48ddc31 --- /dev/null +++ b/src/awesomewm.d.tl/entities/Module_Doc.lua @@ -0,0 +1,12 @@ +local class = require "pl.class" +local List = require "pl.List" + +local Module_Doc = class.Module_Doc() + +function Module_Doc:_init() + self.constructors = List() + self.methods = List() + self.static_functions = List() +end + +return Module_Doc diff --git a/src/awesomewm.d.tl/scraper/module_doc.lua b/src/awesomewm.d.tl/scraper/module_doc.lua new file mode 100644 index 0000000..be2efb6 --- /dev/null +++ b/src/awesomewm.d.tl/scraper/module_doc.lua @@ -0,0 +1,84 @@ +local Function_Info = require "entities.Function_Info" +local Module_Doc = require "entities.Module_Doc" +local scraper_utils = require "scraper.utils" +local utils = require "utils" + +local function extract_function_name(function_name_node) + return function_name_node and (function_name_node.attr.name:gsub(".*:", "")) +end + +local function extract_function_return_types(function_return_types_node) + if not function_return_types_node then + return {} + end + + local selector = "span.types .type" + local html = function_return_types_node:outer_html() + + return scraper_utils.scrape(html, selector, function(node) + return utils.sanitize_string(node:inner_text()) + end) +end + +local function extract_section_functions(dl) + local query_selectors = { + function_name = "dt a", + function_return_type = "dd ol", + } + + return scraper_utils.scrape_tuples( + dl, + { query_selectors.function_name, query_selectors.function_return_type }, + function(nodes) + local 
function_info = Function_Info() + + function_info.name = + extract_function_name(nodes[query_selectors.function_name]) + function_info.return_types = extract_function_return_types( + nodes[query_selectors.function_return_type] + ) + + return function_info + end + ) +end + +local module = {} + +function module.get_doc_from_page(html) + local nodes = scraper_utils.extract_nodes(html, { + "h2.section-header", + "dl.function", + }) + + if #nodes:get "h2.section-header" ~= #nodes:get "dl.function" then + error "The list aren't the same size!" + end + + local module_doc = Module_Doc() + + for i, h2 in ipairs(nodes:get "h2.section-header") do + local section_name = utils.sanitize_string(h2:inner_text()) + local dl_html = nodes:get("dl.function")[i]:outer_html() + + if section_name == "Constructors" then + module_doc.constructors = extract_section_functions(dl_html) + elseif section_name == "Static module functions" then + module_doc.static_functions = extract_section_functions(dl_html) + elseif section_name == "Object properties" then + print "Not implemented: Object properties" + elseif section_name == "Deprecated object properties" then + print "Not implemented: Deprecated object properties" + elseif section_name == "Object methods" then + module_doc.methods = extract_section_functions(dl_html) + elseif section_name == "Signals" then + print "Not implemented: Signals" + else + error("Unknown section name: " .. 
section_name) + end + end + + return module_doc +end + +return module diff --git a/src/awesomewm.d.tl/scraper/utils.lua b/src/awesomewm.d.tl/scraper/utils.lua index 04280ef..4668c0c 100644 --- a/src/awesomewm.d.tl/scraper/utils.lua +++ b/src/awesomewm.d.tl/scraper/utils.lua @@ -1,5 +1,8 @@ +local List = require "pl.List" local log = require "logger" +local Map = require "pl.Map" local scanner = require "web_sanitize.query.scan_html" +local tablex = require "pl.tablex" local scraper_utils = {} @@ -22,4 +25,44 @@ function scraper_utils.scrape(html, query_selector, extract_callback) return ret end +function scraper_utils.extract_nodes(html, query_selectors) + local siblings = Map() + + tablex.foreach(query_selectors, function(query_selector) + siblings:set(query_selector, List()) + end) + + scanner.scan_html(html, function(stack) + tablex.foreach(query_selectors, function(query_selector) + if stack:is(query_selector) then + siblings:get(query_selector):append(stack:current()) + end + end) + end) + + return siblings +end + +function scraper_utils.scrape_tuples(html, query_selectors, extract_callback) + local nodes = scraper_utils.extract_nodes(html, query_selectors) + + local ret = {} + + for i = 1, #nodes:get(query_selectors[1]) do + local node_list = {} + tablex.foreach(query_selectors, function(query_selector) + node_list[query_selector] = nodes:get(query_selector)[i] or nil + end) + local success, info = pcall(extract_callback, node_list) + + if not success then + log:error { message = info } + else + table.insert(ret, info) + end + end + + return ret +end + return scraper_utils -- 2.40.1 From ebe1fdaa8d637a434d24d64d8a514927e496a05c Mon Sep 17 00:00:00 2001 From: Aire-One Date: Thu, 29 Sep 2022 19:13:58 +0200 Subject: [PATCH 3/4] feat(scraper): use the new scraper --- src/awesomewm.d.tl/init.lua | 31 ++-- src/awesomewm.d.tl/scraper/init.lua | 217 +--------------------------- 2 files changed, 22 insertions(+), 226 deletions(-) diff --git 
a/src/awesomewm.d.tl/init.lua b/src/awesomewm.d.tl/init.lua index decd5bb..686c1ee 100644 --- a/src/awesomewm.d.tl/init.lua +++ b/src/awesomewm.d.tl/init.lua @@ -3,19 +3,19 @@ local inspect = require "inspect" local log = require "logger" local properties = require "properties" local scraper = require "scraper" -local generator = require "generator" +-- local generator = require "generator" log:info( inspect { message = "Start extraction", base_url = properties.base_url } ) --- local index = crawler.fetch(properties.base_url .. properties.index_uri) --- local modules = scraper.get_modules_from_index( --- index, --- properties.ignored_modules --- ) +local index = crawler.fetch(properties.base_url .. properties.index_uri) --- log:info(inspect { modules_found = #modules }) +-- local modules = +-- scraper.get_modules_from_index(index, properties.ignored_modules) +local module_infos = scraper.module_info_list.get_modules_from_index(index) + +log:info(inspect { modules_found = #module_infos }) -- for i = 1, 1 do -- #modules do -- local m = modules[i] @@ -25,12 +25,15 @@ log:info( -- log:info(inspect { items }) -- end -local page = - crawler.fetch(properties.base_url .. "/widgets/wibox.widget.imagebox.html") -local items = scraper.get_doc_from_page(page) +local html = + crawler.fetch(properties.base_url .. "/widgets/wibox.widget.textbox.html") +local module_doc = scraper.module_doc.get_doc_from_page(html) +log:info(inspect { module_doc = module_doc }) + +-- local items = scraper.get_doc_from_page(page) -- log:info(inspect { items }) -generator.write( - generator.generate_teal(items), - properties.out_directory .. "/test.tl" -) +-- generator.write( +-- generator.generate_teal(items), +-- properties.out_directory .. 
"/test.tl" +-- ) diff --git a/src/awesomewm.d.tl/scraper/init.lua b/src/awesomewm.d.tl/scraper/init.lua index 6aa1cb2..77155e4 100644 --- a/src/awesomewm.d.tl/scraper/init.lua +++ b/src/awesomewm.d.tl/scraper/init.lua @@ -1,212 +1,5 @@ -local htmlparser = require "htmlparser" -local log = require "logger" -local utils = require "utils" - -local scraper = {} - -function scraper.get_modules_from_index(html, ignored) - local document = htmlparser.parse(html) - - local modules = utils.map(document "#navigation ul > li a", function(node) - return { - name = utils.sanitize_string(node:getcontent()), - uri = node.attributes.href, - } - end) - - local filtered_modules = utils.filter(modules, function(module) - return not utils.has_item(ignored, module.name) - end) - - return filtered_modules -end - -local function extract_first_node_or_fail(node, selector) - local extracted = node(selector)[1] - - if not extracted then - log:error { - message = "Can't find `" .. selector .. "` element!", - node = node:gettext(), - } - error "extract_first_node_or_fail" - end - - return extracted -end - -local function extract_first_node(node, selector) - local extracted = node(selector)[1] - - if not extracted then - return nil - end - - return extracted -end - -local function extract_inner_floating_text(node) - local html = node:getcontent() - - -- Remove inner tags from the html - for _, n in ipairs(node.nodes) do - html = utils.replace(html, n:gettext(), "") - end - - return utils.sanitize_string(html) -end - -local function extract_item_content_parameter_list(content) - for i, n in ipairs(content.nodes) do - -- The parameters
    is the next element after one of these

    - if - n.name == "h3" - and utils.has_item( - { "Parameters:", "Type constraints:", "Arguments" }, - utils.sanitize_string(n:getcontent()) - ) - then - return content.nodes[i + 1] - end - end - - return nil -end - -local function process_item_header_type(node) - local types_node = extract_first_node(node, "span.summary_type") - - if not types_node then - return nil - end - - local types_string = string.match(types_node:getcontent(), "%((.-)%)") - or types_node:getcontent() - types_string = utils.replace(types_string, "or", ",") - local splitted = string.gmatch(types_string, "([^,]+)") - - local types = {} - for t in splitted do - table.insert(types, utils.sanitize_string(t)) - end - - return types -end - -local function process_item_header(node) - local name_node = extract_first_node_or_fail(node, "strong") - - -- Remove starting ":" character on method name - local name = extract_inner_floating_text(name_node):gsub("^:", "") - local type = process_item_header_type(node) - - local item_header = { - name = name, - type = type, - } - - return item_header -end - -local function process_item_content_parameters(node) - local parameter = {} - - local name = extract_first_node_or_fail(node, "span.parameter") - parameter.name = utils.sanitize_string(name:getcontent()) - - local nested_parameters_list = extract_first_node(node, "ul") - if nested_parameters_list then - parameter.type = "table" - parameter.nested_parameters = utils.map( - nested_parameters_list "* > li", - process_item_content_parameters - ) - - return parameter - end - - local types_list = utils.map(node ".type", function(type) - return utils.sanitize_string(type:getcontent()) - end) - if #types_list > 0 then - parameter.type = types_list - end - - return parameter -end - -local function process_item_content(content) - local parameters_list = extract_item_content_parameter_list(content) - local parameters = parameters_list - and utils.map( - parameters_list "* > li", - process_item_content_parameters - ) 
- or {} - - local item_content = { - parameters = #parameters > 0 and parameters or nil, - } - - return item_content -end - -local function process_section_titles(document) - return utils.map(document "h2.section-header", extract_inner_floating_text) -end - -local function process_section_items(item) - local headers = utils.map(item "dt", process_item_header) - local contents = utils.map(item "dd", process_item_content) - - if #headers ~= #contents then - log:error { - message = "extract_item_content failure: headers and contents don't have the same size", - headers = #headers, - contents = #contents, - } - error "extract_item_content" - end - - local item_contents = utils.map(headers, function(header, i) - return { - name = header.name, - type = header.type, - parameters = contents[i].parameters, - } - end) - - return item_contents -end - -local function process_section_contents(document) - local section_items = document "dl.function" - - local section_contents = utils.map(section_items, process_section_items) - - return section_contents -end - -function scraper.get_doc_from_page(html) - local document = htmlparser.parse(html, 9999) - - local sections_titles = process_section_titles(document) - local section_contents = process_section_contents(document) - - if #sections_titles ~= #section_contents then - log:error { - message = "get_items_from_page failure: section_titles and section_contents don't have the same size", - sections_titles = #sections_titles, - func = #section_contents, - } - error "get_items_from_page" - end - - local doc = utils.map(sections_titles, function(title, i) - return { section = title, items = section_contents[i] } - end) - - return doc -end - -return scraper +return { + module_doc = require "scraper.module_doc", + module_info_list = require "scraper.module_info_list", + utils = require "scraper.utils", +} -- 2.40.1 From dc89b182731cd98aadfcf9c810c69274c1915706 Mon Sep 17 00:00:00 2001 From: Aire-One Date: Thu, 29 Sep 2022 19:17:20 +0200 
Subject: [PATCH 4/4] feat(rockspec): remove htmlparser dependency --- rockspecs/awesomewm.d.tl-dev-1.rockspec | 1 - 1 file changed, 1 deletion(-) diff --git a/rockspecs/awesomewm.d.tl-dev-1.rockspec b/rockspecs/awesomewm.d.tl-dev-1.rockspec index d91ae98..cc2af55 100644 --- a/rockspecs/awesomewm.d.tl-dev-1.rockspec +++ b/rockspecs/awesomewm.d.tl-dev-1.rockspec @@ -12,7 +12,6 @@ dependencies = { "lualogging 1.6.0", "inspect 3.1.3", "ansicolors 1.0.2", - "htmlparser 0.3.9", "web_sanitize 1.3.0", "penlight 1.13.1", "luasocket 3.1.0-1", -- 2.40.1