From ff446d3beabf79dcd97d8ded1684c39948e0e938 Mon Sep 17 00:00:00 2001
From: Aire-One
Date: Thu, 29 Sep 2022 19:01:00 +0200
Subject: [PATCH] feat(scraper): implement basic Module_Doc scraper

---
 src/awesomewm.d.tl/entities/Function_Info.lua |  32 ++++++
 src/awesomewm.d.tl/entities/Module_Doc.lua    |  15 +++
 src/awesomewm.d.tl/scraper/module_doc.lua     | 101 ++++++++++++++++++
 src/awesomewm.d.tl/scraper/utils.lua          |  55 ++++++++++
 4 files changed, 203 insertions(+)
 create mode 100644 src/awesomewm.d.tl/entities/Function_Info.lua
 create mode 100644 src/awesomewm.d.tl/entities/Module_Doc.lua
 create mode 100644 src/awesomewm.d.tl/scraper/module_doc.lua

diff --git a/src/awesomewm.d.tl/entities/Function_Info.lua b/src/awesomewm.d.tl/entities/Function_Info.lua
new file mode 100644
index 0000000..956ac11
--- /dev/null
+++ b/src/awesomewm.d.tl/entities/Function_Info.lua
@@ -0,0 +1,32 @@
+local class = require "pl.class"
+local List = require "pl.List"
+
+--- Describes one scraped function: its name, parameters and return types.
+-- The class is named after this entity: `class.<Name>()` uses `<Name>` as the
+-- class name, so it must not reuse the name of another entity.
+local Function_Info = class.Function_Info()
+
+--- Initialize an empty function description.
+function Function_Info:_init()
+    self.name = ""
+    self.parameters = List()
+    self.return_types = List()
+end
+
+--- Append a return type to the function description.
+-- @param return_type The return type to record.
+function Function_Info:append_return_type(return_type)
+    self.return_types:append(return_type)
+end
+
+--- Append a parameter to the function description.
+-- @param name The parameter name.
+-- @param type The parameter type.
+function Function_Info:append_parameter(name, type)
+    self.parameters:append {
+        name = name,
+        type = type,
+    }
+end
+
+return Function_Info
diff --git a/src/awesomewm.d.tl/entities/Module_Doc.lua b/src/awesomewm.d.tl/entities/Module_Doc.lua
new file mode 100644
index 0000000..48ddc31
--- /dev/null
+++ b/src/awesomewm.d.tl/entities/Module_Doc.lua
@@ -0,0 +1,15 @@
+local class = require "pl.class"
+local List = require "pl.List"
+
+--- Holds the functions scraped from one module documentation page, grouped
+-- into constructors, methods and static functions.
+local Module_Doc = class.Module_Doc()
+
+--- Initialize a module description with empty function lists.
+function Module_Doc:_init()
+    self.constructors = List()
+    self.methods = List()
+    self.static_functions = List()
+end
+
+return Module_Doc
diff --git a/src/awesomewm.d.tl/scraper/module_doc.lua b/src/awesomewm.d.tl/scraper/module_doc.lua
new file mode 100644
index 0000000..be2efb6
--- /dev/null
+++ b/src/awesomewm.d.tl/scraper/module_doc.lua
@@ -0,0 +1,101 @@
+local Function_Info = require "entities.Function_Info"
+local Module_Doc = require "entities.Module_Doc"
+local scraper_utils = require "scraper.utils"
+local utils = require "utils"
+
+--- Get the function name from its anchor node.
+-- Everything up to and including the last `:` of the `name` attribute is
+-- stripped.
+-- @param function_name_node The `dt a` node, or nil when it is missing.
+-- @return The function name, or the falsy input when there is no node.
+local function extract_function_name(function_name_node)
+    -- The parentheses drop the substitution count returned by gsub.
+    return function_name_node and (function_name_node.attr.name:gsub(".*:", ""))
+end
+
+--- Get the sanitized return types listed under a function.
+-- @param function_return_types_node The `dd ol` node, or nil when missing.
+-- @return The return type strings; an empty table when there is no node.
+local function extract_function_return_types(function_return_types_node)
+    if not function_return_types_node then
+        return {}
+    end
+
+    local selector = "span.types .type"
+    local html = function_return_types_node:outer_html()
+
+    return scraper_utils.scrape(html, selector, function(node)
+        return utils.sanitize_string(node:inner_text())
+    end)
+end
+
+--- Build a Function_Info for every function found in a section.
+-- @param dl The HTML of the section's `dl.function` element.
+-- @return A table of Function_Info instances.
+local function extract_section_functions(dl)
+    local query_selectors = {
+        function_name = "dt a",
+        function_return_type = "dd ol",
+    }
+
+    return scraper_utils.scrape_tuples(
+        dl,
+        { query_selectors.function_name, query_selectors.function_return_type },
+        function(nodes)
+            local function_info = Function_Info()
+
+            function_info.name =
+                extract_function_name(nodes[query_selectors.function_name])
+            function_info.return_types = extract_function_return_types(
+                nodes[query_selectors.function_return_type]
+            )
+
+            return function_info
+        end
+    )
+end
+
+local module = {}
+
+--- Scrape a module documentation page into a Module_Doc.
+-- Section headers and function lists are paired by position, so the page must
+-- contain as many `h2.section-header` as `dl.function` elements.
+-- @param html The HTML of the documentation page.
+-- @return The Module_Doc filled with the supported sections.
+function module.get_doc_from_page(html)
+    local nodes = scraper_utils.extract_nodes(html, {
+        "h2.section-header",
+        "dl.function",
+    })
+
+    if #nodes:get "h2.section-header" ~= #nodes:get "dl.function" then
+        error "The list aren't the same size!"
+    end
+
+    local module_doc = Module_Doc()
+
+    for i, h2 in ipairs(nodes:get "h2.section-header") do
+        local section_name = utils.sanitize_string(h2:inner_text())
+        local dl_html = nodes:get("dl.function")[i]:outer_html()
+
+        if section_name == "Constructors" then
+            module_doc.constructors = extract_section_functions(dl_html)
+        elseif section_name == "Static module functions" then
+            module_doc.static_functions = extract_section_functions(dl_html)
+        elseif section_name == "Object properties" then
+            print "Not implemented: Object properties"
+        elseif section_name == "Deprecated object properties" then
+            print "Not implemented: Deprecated object properties"
+        elseif section_name == "Object methods" then
+            module_doc.methods = extract_section_functions(dl_html)
+        elseif section_name == "Signals" then
+            print "Not implemented: Signals"
+        else
+            error("Unknown section name: " .. section_name)
+        end
+    end
+
+    return module_doc
+end
+
+return module
diff --git a/src/awesomewm.d.tl/scraper/utils.lua b/src/awesomewm.d.tl/scraper/utils.lua
index 04280ef..4668c0c 100644
--- a/src/awesomewm.d.tl/scraper/utils.lua
+++ b/src/awesomewm.d.tl/scraper/utils.lua
@@ -1,5 +1,8 @@
+local List = require "pl.List"
 local log = require "logger"
+local Map = require "pl.Map"
 local scanner = require "web_sanitize.query.scan_html"
+local tablex = require "pl.tablex"
 
 local scraper_utils = {}
 
@@ -22,4 +25,56 @@ function scraper_utils.scrape(html, query_selector, extract_callback)
     return ret
 end
 
+--- Collect the nodes matching each of the given selectors.
+-- @param html The HTML to scan.
+-- @param query_selectors A list of query selectors.
+-- @return A Map from each selector to the List of nodes matching it.
+function scraper_utils.extract_nodes(html, query_selectors)
+    local siblings = Map()
+
+    tablex.foreach(query_selectors, function(query_selector)
+        siblings:set(query_selector, List())
+    end)
+
+    scanner.scan_html(html, function(stack)
+        tablex.foreach(query_selectors, function(query_selector)
+            if stack:is(query_selector) then
+                siblings:get(query_selector):append(stack:current())
+            end
+        end)
+    end)
+
+    return siblings
+end
+
+--- Scrape groups of nodes, one group per match of the first selector.
+-- The i-th match of every selector is grouped in a table keyed by selector;
+-- a selector with fewer matches leaves its key unset. A callback that raises
+-- has its error logged and its group skipped.
+-- @param html The HTML to scan.
+-- @param query_selectors A list of query selectors.
+-- @param extract_callback Called with each group; its result is collected.
+-- @return A table of the callback results.
+function scraper_utils.scrape_tuples(html, query_selectors, extract_callback)
+    local nodes = scraper_utils.extract_nodes(html, query_selectors)
+
+    local ret = {}
+
+    for i = 1, #nodes:get(query_selectors[1]) do
+        local node_list = {}
+        tablex.foreach(query_selectors, function(query_selector)
+            node_list[query_selector] = nodes:get(query_selector)[i]
+        end)
+        local success, info = pcall(extract_callback, node_list)
+
+        if not success then
+            log:error { message = info }
+        else
+            table.insert(ret, info)
+        end
+    end
+
+    return ret
+end
+
 return scraper_utils