From 7e225cf4526175e22c4408ab7d2a79bba19dff28 Mon Sep 17 00:00:00 2001 From: Aire-One Date: Sun, 29 Jan 2023 19:21:45 +0100 Subject: [PATCH] feat(scraper): move "Object properties" to AST --- src/awesomewm.d.tl/scraper/module_doc.tl | 375 ++++++++++++----------- src/awesomewm.d.tl/scraper/utils.tl | 40 +++ src/awesomewm.d.tl/utils.tl | 10 + 3 files changed, 251 insertions(+), 174 deletions(-) diff --git a/src/awesomewm.d.tl/scraper/module_doc.tl b/src/awesomewm.d.tl/scraper/module_doc.tl index 6c7bd66..ea867c4 100644 --- a/src/awesomewm.d.tl/scraper/module_doc.tl +++ b/src/awesomewm.d.tl/scraper/module_doc.tl @@ -1,14 +1,10 @@ -local Function_Info = require "entity.Function_Info" -local List = require "pl.List" +local ast = require("ast") +local type Node = require("types.Node") local logger = require "logger" -local Map = require "pl.Map" -local Module_Doc = require "entity.Module_Doc" local scan = require "web_sanitize.query.scan_html" local scraper_utils = require "scraper.utils" local stringx = require "pl.stringx" -local Type_Info = require "entity.Type_Info" local utils = require "utils" -local Variable_Info = require "entity.Variable_Info" local log = logger.log("scraper") @@ -16,94 +12,93 @@ local function extract_node_text(node: scan.HTMLNode): string return utils.sanitize_string(node:inner_text()) end -local function parse_parameter_types(parameter_type: string): List +local function parse_parameter_types(parameter_type: string): { string } if parameter_type == "" then - local type_info: Type_Info.Type_Info = Type_Info("any") - return List({ type_info }) + return { "any" } end - return stringx.split(parameter_type, " or "):map( - function(type_name: string): Type_Info.Type_Info - return Type_Info(utils.sanitize_string(type_name)) - end - ) + local types = {} + for t in stringx.split(parameter_type, " or "):iter() do + table.insert(types, t) + end + return types end local function extract_item_name(item_name_node: scan.HTMLNode): string return 
item_name_node and ((item_name_node.attr.name as string):gsub("^.*[%.:]", "")) end -local function extract_function_parameter_Parameters(tr_node: scan.HTMLNode): { Variable_Info.Variable_Info } - local query_selectors = { - name = "span.parameter", - types = "span.types" - } +-- local function extract_function_parameter_Parameters(tr_node: scan.HTMLNode): { Variable_Info.Variable_Info } +-- local query_selectors = { +-- name = "span.parameter", +-- types = "span.types" +-- } - return scraper_utils.scrape_tuples( - tr_node:outer_html(), - { query_selectors.name, query_selectors.types }, - function(nodes: { string : scan.HTMLNode | nil }): Variable_Info.Variable_Info - return Variable_Info( - extract_node_text(nodes[query_selectors.name] as scan.HTMLNode), - parse_parameter_types(extract_node_text(nodes[query_selectors.types] as scan.HTMLNode)) - ) - end) -end +-- return scraper_utils.scrape_tuples( +-- tr_node:outer_html(), +-- { query_selectors.name, query_selectors.types }, +-- function(nodes: { string : scan.HTMLNode | nil }): Variable_Info.Variable_Info +-- return Variable_Info( +-- extract_node_text(nodes[query_selectors.name] as scan.HTMLNode), +-- parse_parameter_types(extract_node_text(nodes[query_selectors.types] as scan.HTMLNode)) +-- ) +-- end) +-- end -local function extract_function_parameters(function_parameters_node: scan.HTMLNode): { Variable_Info.Variable_Info } - local current_record_parameter: Type_Info.Type_Info | nil = nil +-- local function extract_function_parameters(function_parameters_node: scan.HTMLNode): { Variable_Info.Variable_Info } +-- local current_record_parameter: Type_Info.Type_Info | nil = nil - return scraper_utils.scrape( - function_parameters_node:outer_html(), - "tr", - function(line_node: scan.HTMLNode): Variable_Info.Variable_Info - local parameters = extract_function_parameter_Parameters(line_node) - if #parameters == 0 then - return nil - elseif #parameters ~= 1 then - log:error(logger.message_with_metadata("Expected 1 
parameter by node", - { len = #parameters, line_node = line_node, parameters = parameters })) - error("Expected 1 parameter by node") - end - local name, types = parameters[1].name, parameters[1].types +-- return scraper_utils.scrape( +-- function_parameters_node:outer_html(), +-- "tr", +-- function(line_node: scan.HTMLNode): Variable_Info.Variable_Info +-- local parameters = extract_function_parameter_Parameters(line_node) +-- if #parameters == 0 then +-- return nil +-- elseif #parameters ~= 1 then +-- log:error(logger.message_with_metadata("Expected 1 parameter by node", +-- { len = #parameters, line_node = line_node, parameters = parameters })) +-- error("Expected 1 parameter by node") +-- end +-- local name, types = parameters[1].name, parameters[1].types - if line_node.attr ~= nil and line_node.attr.class == "see_also_sublist" and current_record_parameter then - local record_parameter = current_record_parameter as Type_Info.Type_Info - if not record_parameter.record_entries then - record_parameter.record_entries = Map() - end +-- if line_node.attr ~= nil and line_node.attr.class == "see_also_sublist" and current_record_parameter then +-- local record_parameter = current_record_parameter as Type_Info.Type_Info +-- if not record_parameter.record_entries then +-- record_parameter.record_entries = Map() +-- end - (record_parameter.record_entries as Map<string, List<Type_Info.Type_Info>>):set(name, types) +-- (record_parameter.record_entries as Map<string, List<Type_Info.Type_Info>>):set(name, types) - return nil - end +-- return nil +-- end - if #types == 1 and types[1].name == "table" then - local record_name = utils.capitalize(name) - current_record_parameter = Type_Info(record_name) - return Variable_Info( - name, - List({ current_record_parameter }) - ) - end +-- if #types == 1 and types[1].name == "table" then +-- local record_name = utils.capitalize(name) +-- current_record_parameter = Type_Info(record_name) +-- return Variable_Info( +-- name, +-- List({ current_record_parameter }) +-- ) +-- end - return Variable_Info(name, 
types) - end) -end +-- return Variable_Info(name, types) +-- end) +-- end -local function extract_function_return_types(function_return_types_node: scan.HTMLNode): List - if not function_return_types_node then - return {} - end +-- local function extract_function_return_types(function_return_types_node: scan.HTMLNode): List +-- if not function_return_types_node then +-- return {} +-- end - local selector = "span.types .type" - local html = function_return_types_node:outer_html() +-- local selector = "span.types .type" +-- local html = function_return_types_node:outer_html() - return List(scraper_utils.scrape(html, selector, extract_node_text)):map( - function(type_name: string): Type_Info.Type_Info - return Type_Info(type_name) - end) -end +-- return List(scraper_utils.scrape(html, selector, extract_node_text)):map( +-- function(type_name: string): Type_Info.Type_Info +-- return Type_Info(type_name) +-- end) +-- end local function extract_property_constraints(property_constraint_node: scan.HTMLNode): { string } return scraper_utils.scrape( @@ -113,147 +108,179 @@ local function extract_property_constraints(property_constraint_node: scan.HTMLN ) end -local function extract_section_functions(dl: string): { Function_Info.Function_Info } - local query_selectors = { - header = "dt", - name = "a", - body = "dd", - parameters = "table", - return_types = "ol", - } +-- local function extract_section_functions(dl: string): { Function_Info.Function_Info } +-- local query_selectors = { +-- header = "dt", +-- name = "a", +-- body = "dd", +-- parameters = "table", +-- return_types = "ol", +-- } - return scraper_utils.scrape_tuples( - dl, - { query_selectors.header, query_selectors.body }, - function(nodes: { string : scan.HTMLNode | nil }): Function_Info.Function_Info - if not nodes[query_selectors.header] or not nodes[query_selectors.body] then - log:warn( - logger.message_with_metadata( - "Missing header or body", - { nodes = nodes } - ) - ) - error("Missing header or body") - 
end - local header = nodes[query_selectors.header] as scan.HTMLNode - local body = nodes[query_selectors.body] as scan.HTMLNode - local body_elements = scraper_utils.extract_nodes( - body:outer_html(), - { query_selectors.parameters, query_selectors.return_types } - ) - return Function_Info( - scraper_utils.scrape( - header:outer_html(), - query_selectors.name, - extract_item_name - )[1], - #body_elements:get(query_selectors.parameters) ~= 0 and - List(extract_function_parameters(body_elements:get(query_selectors.parameters)[1])) or - (List() as List), - #body_elements:get(query_selectors.return_types) ~= 0 and - extract_function_return_types(body_elements:get(query_selectors.return_types)[1]) or - (List() as List) - ) - end - ) -end +-- return scraper_utils.scrape_tuples( +-- dl, +-- { query_selectors.header, query_selectors.body }, +-- function(nodes: { string : scan.HTMLNode | nil }): Function_Info.Function_Info +-- if not nodes[query_selectors.header] or not nodes[query_selectors.body] then +-- log:warn( +-- logger.message_with_metadata( +-- "Missing header or body", +-- { nodes = nodes } +-- ) +-- ) +-- error("Missing header or body") +-- end +-- local header = nodes[query_selectors.header] as scan.HTMLNode +-- local body = nodes[query_selectors.body] as scan.HTMLNode +-- local body_elements = scraper_utils.extract_nodes( +-- body:outer_html(), +-- { query_selectors.parameters, query_selectors.return_types } +-- ) +-- return Function_Info( +-- scraper_utils.scrape( +-- header:outer_html(), +-- query_selectors.name, +-- extract_item_name +-- )[1], +-- #body_elements:get(query_selectors.parameters) ~= 0 and +-- List(extract_function_parameters(body_elements:get(query_selectors.parameters)[1])) or +-- (List() as List), +-- #body_elements:get(query_selectors.return_types) ~= 0 and +-- extract_function_return_types(body_elements:get(query_selectors.return_types)[1]) or +-- (List() as List) +-- ) +-- end +-- ) +-- end -local function extract_section_variables(dl: 
string): { Variable_Info.Variable_Info } - local query_selectors = { +local function extract_section_variables(dl: string): { Node }, { string } + local query_selectors : { string : string } = { variable_name = "dt a", variable_summary_type = "dt span.summary_type", variable_property_constraint = "dd span.property_type", } - return scraper_utils.scrape_tuples( + local variables = {} + local signals = {} + + for nodes in scraper_utils.iter_tuples( dl, - { query_selectors.variable_name, query_selectors.variable_summary_type, query_selectors.variable_property_constraint }, - function(nodes: { string : scan.HTMLNode | nil }): Variable_Info.Variable_Info - local variable_info = Variable_Info() + utils.values(query_selectors) + ) do + local node = ast.create_node("variable", extract_item_name(nodes[query_selectors.variable_name])) + node.types = parse_parameter_types(extract_node_text(nodes[query_selectors.variable_summary_type])) - variable_info.name = extract_item_name(nodes[query_selectors.variable_name]) - variable_info.types = parse_parameter_types(extract_node_text(nodes[query_selectors.variable_summary_type])) - - if #variable_info.types == 1 and variable_info.types[1].name == "string" then - log:debug("extract variable string with constraints, this is an enum") - variable_info.constraints = List(extract_property_constraints(nodes[query_selectors.variable_property_constraint])):map( - function(constraint: string): string - return (constraint:gsub("&quot;", "")) - end + if #node.types == 1 and node.types[1] == "string" then + log:debug("extract variable string with constraints, this is an enum") + local type_enum = ast.create_node("enum", utils.capitalize(node.name)) + for _, constraint in ipairs(extract_property_constraints(nodes[query_selectors.variable_property_constraint])) do + table.insert( + type_enum.children, + ast.create_node("identifier", (constraint:gsub("&quot;", ""))) ) end - - return variable_info + table.insert(variables, type_enum) + node.types = { 
type_enum.name } end - ) + + table.insert(variables, node) + table.insert(signals, string.format("property::%s", node.name)) -- TODO : actually scrape the signals from the doc + end + + return variables, signals end -local function extract_section_signal(dl: string): { string } - local selector = "dt strong" +-- local function extract_section_signal(dl: string): { string } +-- local selector = "dt strong" - return scraper_utils.scrape(dl, selector, extract_node_text) -end +-- return scraper_utils.scrape(dl, selector, extract_node_text) +-- end local enum Section - "Constructors" - "Static module functions" + -- "Constructors" + -- "Static module functions" "Object properties" - "Object methods" - "Signals" + -- "Object methods" + -- "Signals" end -local section_scrapers: { Section : function(html: string, module_doc: Module_Doc.Module_Doc) } = { - ["Constructors"] = function(html: string, module_doc: Module_Doc.Module_Doc) - module_doc.constructors = List(extract_section_functions(html)) - end, - ["Static module functions"] = function(html: string, module_doc: Module_Doc.Module_Doc) - module_doc.static_functions = List(extract_section_functions(html)) - end, - ["Object properties"] = function(html: string, module_doc: Module_Doc.Module_Doc) - module_doc.properties = List(extract_section_variables(html)) - end, - ["Object methods"] = function(html: string, module_doc: Module_Doc.Module_Doc) - local self_parameter = Variable_Info("self", List({ Type_Info(module_doc.record_name) })) - module_doc.methods = List(extract_section_functions(html)):map( - function(method: Function_Info.Function_Info): Function_Info.Function_Info - method.parameters:insert(1, self_parameter) - return method - end - ) - end, - ["Signals"] = function(html: string, module_doc: Module_Doc.Module_Doc) - module_doc.signals = List(extract_section_signal(html)) +-- returns +-- - Nodes that should be added to the module +-- - Nodes that should be added to the global scope +-- - Strings that should be 
added to the record Signals +local section_scrapers : { Section : function(html: string): { Node }, { Node }, { string } } = { + -- ["Constructors"] = function(html: string, module_doc: Module_Doc.Module_Doc) + -- module_doc.constructors = List(extract_section_functions(html)) + -- end, + -- ["Static module functions"] = function(html: string, module_doc: Module_Doc.Module_Doc) + -- module_doc.static_functions = List(extract_section_functions(html)) + -- end, + ["Object properties"] = function(html: string): { Node }, { Node }, { string } + local properties, signals = extract_section_variables(html) + return properties, {}, signals end, + -- ["Object methods"] = function(html: string, module_doc: Module_Doc.Module_Doc) + -- local self_parameter = Variable_Info("self", List({ Type_Info(module_doc.record_name) })) + -- module_doc.methods = List(extract_section_functions(html)):map( + -- function(method: Function_Info.Function_Info): Function_Info.Function_Info + -- method.parameters:insert(1, self_parameter) + -- return method + -- end + -- ) + -- end, + -- ["Signals"] = function(html: string, module_doc: Module_Doc.Module_Doc) + -- module_doc.signals = List(extract_section_signal(html)) + -- end, } +-- local function extract_node_module_name(node: Node): string +-- return (node.name:gsub("(.*)[%.:].+$", "%1")) +-- end + local module = {} -function module.get_doc_from_page(html: string, module_name: string): Module_Doc.Module_Doc - local nodes = scraper_utils.extract_nodes(html, { +function module.get_doc_from_page(html: string, module_name: string): Node, { Node } + local html_nodes = scraper_utils.extract_nodes(html, { "h2.section-header", "dl.function", }) - if #nodes:get "h2.section-header" ~= #nodes:get "dl.function" then + if #html_nodes:get "h2.section-header" ~= #html_nodes:get "dl.function" then error "The list aren't the same size!" 
end - local module_doc = Module_Doc() - module_doc.record_name = utils.capitalize((module_name:gsub(".*%.", ""))) + local record_name = utils.capitalize((module_name:gsub(".*%.", ""))) + local module_root = ast.create_node("module", record_name) + local other_nodes : { Node } = {} - for i = 1, #nodes:get("h2.section-header") do - local h2 = nodes:get("h2.section-header")[i] + local module_signals_node = ast.create_node("enum", "Signal") + table.insert(module_root.children, module_signals_node) + + for i = 1, #html_nodes:get("h2.section-header") do + local h2 = html_nodes:get("h2.section-header")[i] local section_name = utils.sanitize_string(h2:inner_text()) as Section -- promote to Section, we then test if the section_name is in the table - local dl_html = nodes:get("dl.function")[i]:outer_html() + local dl_html = html_nodes:get("dl.function")[i]:outer_html() if section_scrapers[section_name] then - section_scrapers[section_name](dl_html, module_doc) + local module_nodes, global_nodes, signals_name = section_scrapers[section_name](dl_html) + for _, node in ipairs(module_nodes) do + table.insert(module_root.children, node) + end + for _, node in ipairs(global_nodes) do + table.insert(other_nodes, node) + end + for _, signal_name in ipairs(signals_name) do + table.insert( + module_signals_node.children, + ast.create_node("identifier", signal_name) + ) + end else log:warn("Section scraper not implemented: " .. 
section_name) end end - return module_doc + return module_root, other_nodes end return module diff --git a/src/awesomewm.d.tl/scraper/utils.tl b/src/awesomewm.d.tl/scraper/utils.tl index 26ff900..33032ca 100644 --- a/src/awesomewm.d.tl/scraper/utils.tl +++ b/src/awesomewm.d.tl/scraper/utils.tl @@ -72,4 +72,44 @@ function scraper_utils.scrape_tuples(html: string, query_selectors: { string return ret end +function scraper_utils.iter_tuples(html: string, query_selectors: { string }): function(): { string : scan.HTMLNode } + local siblings: { string : { scan.HTMLNode } } = {} + for _, query_selector in ipairs(query_selectors) do + siblings[query_selector] = {} + end + + scanner.scan_html( + html, + function(stack: scan.NodeStack) + for _, query_selector in ipairs(query_selectors) do + if stack:is(query_selector) then + table.insert(siblings[query_selector], stack:current()) + end + end + end + ) + + local siblings_count = #siblings[query_selectors[1]] + for _, query_selector in ipairs(query_selectors) do + if #siblings[query_selector] ~= siblings_count then + error("Query selectors do not have the same number of siblings") + end + end + + local i = 0 + return function(): { string : scan.HTMLNode } + i = i + 1 + if i > siblings_count then + return nil + end + + local node_list: { string : scan.HTMLNode } = {} + for _, query_selector in ipairs(query_selectors) do + node_list[query_selector] = siblings[query_selector][i] + end + + return node_list + end +end + return scraper_utils diff --git a/src/awesomewm.d.tl/utils.tl b/src/awesomewm.d.tl/utils.tl index 828070e..824b9dc 100644 --- a/src/awesomewm.d.tl/utils.tl +++ b/src/awesomewm.d.tl/utils.tl @@ -35,6 +35,16 @@ function utils.map(list: { T }, iteratee: function(value: T, position: int return mapped end +function utils.values<T>(t: table): { T } + local values: { T } = {} + + for _, v in pairs(t) do + table.insert(values, v as T) + end + + return values +end + function utils.sanitize_string(s: string): string return 
(stringx.strip(web_sanitize.extract_text(s))) end