feat(scraper): move "Object properties" to AST
ci/woodpecker/pr/build Pipeline failed Details
ci/woodpecker/pr/docker-build Pipeline was successful Details
ci/woodpecker/pr/lint Pipeline failed Details

This commit is contained in:
Aire-One 2023-01-29 19:21:45 +01:00
parent ee7f986465
commit 2017fffb48
3 changed files with 252 additions and 174 deletions

View File

@ -1,14 +1,10 @@
local Function_Info = require "entity.Function_Info" local ast <const> = require("ast")
local List = require "pl.List" local type Node = require("types.Node")
local logger = require "logger" local logger = require "logger"
local Map = require "pl.Map"
local Module_Doc = require "entity.Module_Doc"
local scan = require "web_sanitize.query.scan_html" local scan = require "web_sanitize.query.scan_html"
local scraper_utils = require "scraper.utils" local scraper_utils = require "scraper.utils"
local stringx = require "pl.stringx" local stringx = require "pl.stringx"
local Type_Info = require "entity.Type_Info"
local utils = require "utils" local utils = require "utils"
local Variable_Info = require "entity.Variable_Info"
local log = logger.log("scraper") local log = logger.log("scraper")
@ -16,94 +12,93 @@ local function extract_node_text(node: scan.HTMLNode): string
return utils.sanitize_string(node:inner_text()) return utils.sanitize_string(node:inner_text())
end end
local function parse_parameter_types(parameter_type: string): List<Type_Info.Type_Info> local function parse_parameter_types(parameter_type: string): { string }
if parameter_type == "" then if parameter_type == "" then
local type_info: Type_Info.Type_Info = Type_Info("any") return { "any" }
return List({ type_info })
end end
return stringx.split(parameter_type, " or "):map( local types = {}
function(type_name: string): Type_Info.Type_Info for t in stringx.split(parameter_type, " or "):iter() do
return Type_Info(utils.sanitize_string(type_name)) table.insert(types, t)
end end
) return types
end end
local function extract_item_name(item_name_node: scan.HTMLNode): string local function extract_item_name(item_name_node: scan.HTMLNode): string
return item_name_node and ((item_name_node.attr.name as string):gsub("^.*[%.:]", "")) return item_name_node and ((item_name_node.attr.name as string):gsub("^.*[%.:]", ""))
end end
local function extract_function_parameter_Parameters(tr_node: scan.HTMLNode): { Variable_Info.Variable_Info } -- local function extract_function_parameter_Parameters(tr_node: scan.HTMLNode): { Variable_Info.Variable_Info }
local query_selectors = { -- local query_selectors = {
name = "span.parameter", -- name = "span.parameter",
types = "span.types" -- types = "span.types"
} -- }
return scraper_utils.scrape_tuples( -- return scraper_utils.scrape_tuples(
tr_node:outer_html(), -- tr_node:outer_html(),
{ query_selectors.name, query_selectors.types }, -- { query_selectors.name, query_selectors.types },
function(nodes: { string : scan.HTMLNode | nil }): Variable_Info.Variable_Info -- function(nodes: { string : scan.HTMLNode | nil }): Variable_Info.Variable_Info
return Variable_Info( -- return Variable_Info(
extract_node_text(nodes[query_selectors.name] as scan.HTMLNode), -- extract_node_text(nodes[query_selectors.name] as scan.HTMLNode),
parse_parameter_types(extract_node_text(nodes[query_selectors.types] as scan.HTMLNode)) -- parse_parameter_types(extract_node_text(nodes[query_selectors.types] as scan.HTMLNode))
) -- )
end) -- end)
end -- end
local function extract_function_parameters(function_parameters_node: scan.HTMLNode): { Variable_Info.Variable_Info } -- local function extract_function_parameters(function_parameters_node: scan.HTMLNode): { Variable_Info.Variable_Info }
local current_record_parameter: Type_Info.Type_Info | nil = nil -- local current_record_parameter: Type_Info.Type_Info | nil = nil
return scraper_utils.scrape( -- return scraper_utils.scrape(
function_parameters_node:outer_html(), -- function_parameters_node:outer_html(),
"tr", -- "tr",
function(line_node: scan.HTMLNode): Variable_Info.Variable_Info -- function(line_node: scan.HTMLNode): Variable_Info.Variable_Info
local parameters = extract_function_parameter_Parameters(line_node) -- local parameters = extract_function_parameter_Parameters(line_node)
if #parameters == 0 then -- if #parameters == 0 then
return nil -- return nil
elseif #parameters ~= 1 then -- elseif #parameters ~= 1 then
log:error(logger.message_with_metadata("Expected 1 parameter by <tr> node", -- log:error(logger.message_with_metadata("Expected 1 parameter by <tr> node",
{ len = #parameters, line_node = line_node, parameters = parameters })) -- { len = #parameters, line_node = line_node, parameters = parameters }))
error("Expected 1 parameter by <tr> node") -- error("Expected 1 parameter by <tr> node")
end -- end
local name, types = parameters[1].name, parameters[1].types -- local name, types = parameters[1].name, parameters[1].types
if line_node.attr ~= nil and line_node.attr.class == "see_also_sublist" and current_record_parameter then -- if line_node.attr ~= nil and line_node.attr.class == "see_also_sublist" and current_record_parameter then
local record_parameter = current_record_parameter as Type_Info.Type_Info -- local record_parameter = current_record_parameter as Type_Info.Type_Info
if not record_parameter.record_entries then -- if not record_parameter.record_entries then
record_parameter.record_entries = Map() -- record_parameter.record_entries = Map()
end -- end
(record_parameter.record_entries as Map<string, List<Type_Info.Type_Info>>):set(name, types) -- (record_parameter.record_entries as Map<string, List<Type_Info.Type_Info>>):set(name, types)
return nil -- return nil
end -- end
if #types == 1 and types[1].name == "table" then -- if #types == 1 and types[1].name == "table" then
local record_name = utils.capitalize(name) -- local record_name = utils.capitalize(name)
current_record_parameter = Type_Info(record_name) -- current_record_parameter = Type_Info(record_name)
return Variable_Info( -- return Variable_Info(
name, -- name,
List({ current_record_parameter }) -- List({ current_record_parameter })
) -- )
end -- end
return Variable_Info(name, types) -- return Variable_Info(name, types)
end) -- end)
end -- end
local function extract_function_return_types(function_return_types_node: scan.HTMLNode): List<Type_Info.Type_Info> -- local function extract_function_return_types(function_return_types_node: scan.HTMLNode): List<Type_Info.Type_Info>
if not function_return_types_node then -- if not function_return_types_node then
return {} -- return {}
end -- end
local selector = "span.types .type" -- local selector = "span.types .type"
local html = function_return_types_node:outer_html() -- local html = function_return_types_node:outer_html()
return List(scraper_utils.scrape(html, selector, extract_node_text)):map( -- return List(scraper_utils.scrape(html, selector, extract_node_text)):map(
function(type_name: string): Type_Info.Type_Info -- function(type_name: string): Type_Info.Type_Info
return Type_Info(type_name) -- return Type_Info(type_name)
end) -- end)
end -- end
local function extract_property_constraints(property_constraint_node: scan.HTMLNode): { string } local function extract_property_constraints(property_constraint_node: scan.HTMLNode): { string }
return scraper_utils.scrape( return scraper_utils.scrape(
@ -113,147 +108,180 @@ local function extract_property_constraints(property_constraint_node: scan.HTMLN
) )
end end
local function extract_section_functions(dl: string): { Function_Info.Function_Info } -- local function extract_section_functions(dl: string): { Function_Info.Function_Info }
local query_selectors = { -- local query_selectors = {
header = "dt", -- header = "dt",
name = "a", -- name = "a",
body = "dd", -- body = "dd",
parameters = "table", -- parameters = "table",
return_types = "ol", -- return_types = "ol",
} -- }
return scraper_utils.scrape_tuples( -- return scraper_utils.scrape_tuples(
dl, -- dl,
{ query_selectors.header, query_selectors.body }, -- { query_selectors.header, query_selectors.body },
function(nodes: { string : scan.HTMLNode | nil }): Function_Info.Function_Info -- function(nodes: { string : scan.HTMLNode | nil }): Function_Info.Function_Info
if not nodes[query_selectors.header] or not nodes[query_selectors.body] then -- if not nodes[query_selectors.header] or not nodes[query_selectors.body] then
log:warn( -- log:warn(
logger.message_with_metadata( -- logger.message_with_metadata(
"Missing header or body", -- "Missing header or body",
{ nodes = nodes } -- { nodes = nodes }
) -- )
) -- )
error("Missing header or body") -- error("Missing header or body")
end -- end
local header = nodes[query_selectors.header] as scan.HTMLNode -- local header = nodes[query_selectors.header] as scan.HTMLNode
local body = nodes[query_selectors.body] as scan.HTMLNode -- local body = nodes[query_selectors.body] as scan.HTMLNode
local body_elements = scraper_utils.extract_nodes( -- local body_elements = scraper_utils.extract_nodes(
body:outer_html(), -- body:outer_html(),
{ query_selectors.parameters, query_selectors.return_types } -- { query_selectors.parameters, query_selectors.return_types }
) -- )
return Function_Info( -- return Function_Info(
scraper_utils.scrape( -- scraper_utils.scrape(
header:outer_html(), -- header:outer_html(),
query_selectors.name, -- query_selectors.name,
extract_item_name -- extract_item_name
)[1], -- )[1],
#body_elements:get(query_selectors.parameters) ~= 0 and -- #body_elements:get(query_selectors.parameters) ~= 0 and
List(extract_function_parameters(body_elements:get(query_selectors.parameters)[1])) or -- List(extract_function_parameters(body_elements:get(query_selectors.parameters)[1])) or
(List() as List<Variable_Info.Variable_Info>), -- (List() as List<Variable_Info.Variable_Info>),
#body_elements:get(query_selectors.return_types) ~= 0 and -- #body_elements:get(query_selectors.return_types) ~= 0 and
extract_function_return_types(body_elements:get(query_selectors.return_types)[1]) or -- extract_function_return_types(body_elements:get(query_selectors.return_types)[1]) or
(List() as List<Type_Info.Type_Info>) -- (List() as List<Type_Info.Type_Info>)
) -- )
end -- end
) -- )
end -- end
local function extract_section_variables(dl: string): { Variable_Info.Variable_Info } local function extract_section_variables(dl: string): { Node }, { string }
local query_selectors = { local query_selectors <const>: { string : string } = {
variable_name = "dt a", variable_name = "dt a",
variable_summary_type = "dt span.summary_type", variable_summary_type = "dt span.summary_type",
variable_property_constraint = "dd span.property_type", variable_property_constraint = "dd span.property_type",
} }
return scraper_utils.scrape_tuples( local variables <const> = {}
local signals <const> = {}
for nodes in scraper_utils.iter_tuples(
dl, dl,
{ query_selectors.variable_name, query_selectors.variable_summary_type, query_selectors.variable_property_constraint }, utils.values(query_selectors)
function(nodes: { string : scan.HTMLNode | nil }): Variable_Info.Variable_Info ) do
local variable_info = Variable_Info() local node = ast.create_node("variable", extract_item_name(nodes[query_selectors.variable_name]))
node.types = parse_parameter_types(extract_node_text(nodes[query_selectors.variable_summary_type]))
variable_info.name = extract_item_name(nodes[query_selectors.variable_name]) if #node.types == 1 and node.types[1] == "string" then
variable_info.types = parse_parameter_types(extract_node_text(nodes[query_selectors.variable_summary_type])) log:debug("extract variable string with constraints, this is an enum")
local type_enum <const> = ast.create_node("enum", utils.capitalize(node.name))
if #variable_info.types == 1 and variable_info.types[1].name == "string" then for _, constraint in ipairs(extract_property_constraints(nodes[query_selectors.variable_property_constraint])) do
log:debug("extract variable string with constraints, this is an enum") table.insert(
variable_info.constraints = List(extract_property_constraints(nodes[query_selectors.variable_property_constraint])):map( type_enum.children,
function(constraint: string): string ast.create_node("identifier", (constraint:gsub("&quot;", "")))
return (constraint:gsub("&quot;", ""))
end
) )
end end
table.insert(variables, type_enum)
return variable_info node.types = { type_enum.name }
end end
)
table.insert(variables, node)
table.insert(signals, string.format("property::%s", node.name)) -- TODO : actually scrape the signals from the doc
end
return variables, signals
end end
local function extract_section_signal(dl: string): { string } -- local function extract_section_signal(dl: string): { string }
local selector = "dt strong" -- local selector = "dt strong"
return scraper_utils.scrape(dl, selector, extract_node_text) -- return scraper_utils.scrape(dl, selector, extract_node_text)
end -- end
local enum Section local enum Section
"Constructors" -- "Constructors"
"Static module functions" -- "Static module functions"
"Object properties" "Object properties"
"Object methods" -- "Object methods"
"Signals" -- "Signals"
end end
local section_scrapers: { Section : function(html: string, module_doc: Module_Doc.Module_Doc) } = { -- returns
["Constructors"] = function(html: string, module_doc: Module_Doc.Module_Doc) -- - Nodes that should be added to the module
module_doc.constructors = List(extract_section_functions(html)) -- - Nodes that should be added to the global scope
end, -- - Strings that should be added to the record Signals
["Static module functions"] = function(html: string, module_doc: Module_Doc.Module_Doc) local section_scrapers <total>: { Section : function(html: string): { Node }, { Node }, { string } } = {
module_doc.static_functions = List(extract_section_functions(html)) -- ["Constructors"] = function(html: string, module_doc: Module_Doc.Module_Doc)
end, -- module_doc.constructors = List(extract_section_functions(html))
["Object properties"] = function(html: string, module_doc: Module_Doc.Module_Doc) -- end,
module_doc.properties = List(extract_section_variables(html)) -- ["Static module functions"] = function(html: string, module_doc: Module_Doc.Module_Doc)
end, -- module_doc.static_functions = List(extract_section_functions(html))
["Object methods"] = function(html: string, module_doc: Module_Doc.Module_Doc) -- end,
local self_parameter = Variable_Info("self", List({ Type_Info(module_doc.record_name) })) ["Object properties"] = function(html: string): { Node }, { Node }, { string }
module_doc.methods = List(extract_section_functions(html)):map( local properties, signals = extract_section_variables(html)
function(method: Function_Info.Function_Info): Function_Info.Function_Info return properties, {}, signals
method.parameters:insert(1, self_parameter)
return method
end
)
end,
["Signals"] = function(html: string, module_doc: Module_Doc.Module_Doc)
module_doc.signals = List(extract_section_signal(html))
end, end,
-- ["Object methods"] = function(html: string, module_doc: Module_Doc.Module_Doc)
-- local self_parameter = Variable_Info("self", List({ Type_Info(module_doc.record_name) }))
-- module_doc.methods = List(extract_section_functions(html)):map(
-- function(method: Function_Info.Function_Info): Function_Info.Function_Info
-- method.parameters:insert(1, self_parameter)
-- return method
-- end
-- )
-- end,
-- ["Signals"] = function(html: string, module_doc: Module_Doc.Module_Doc)
-- module_doc.signals = List(extract_section_signal(html))
-- end,
} }
-- local function extract_node_module_name(node: Node): string
-- return (node.name:gsub("(.*)[%.:].+$", "%1"))
-- end
local module = {} local module = {}
function module.get_doc_from_page(html: string, module_name: string): Module_Doc.Module_Doc function module.get_doc_from_page(html: string, module_name: string): Node, { Node }
local nodes = scraper_utils.extract_nodes(html, { local html_nodes = scraper_utils.extract_nodes(html, {
"h2.section-header", "h2.section-header",
"dl.function", "dl.function",
}) })
if #nodes:get "h2.section-header" ~= #nodes:get "dl.function" then if #html_nodes:get "h2.section-header" ~= #html_nodes:get "dl.function" then
error "The list aren't the same size!" error "The list aren't the same size!"
end end
local module_doc = Module_Doc() local record_name <const> = utils.capitalize((module_name:gsub(".*%.", "")))
module_doc.record_name = utils.capitalize((module_name:gsub(".*%.", ""))) local module_root <const> = ast.create_node("module", record_name)
local other_nodes <const>: { Node } = {}
for i = 1, #nodes:get("h2.section-header") do local module_signals_node <const> = ast.create_node("enum", "Signal")
local h2 = nodes:get("h2.section-header")[i] table.insert(module_root.children, module_signals_node)
for i = 1, #html_nodes:get("h2.section-header") do
local h2 = html_nodes:get("h2.section-header")[i]
local section_name = utils.sanitize_string(h2:inner_text()) as Section -- promote to Section, we then test if the section_name is in the table local section_name = utils.sanitize_string(h2:inner_text()) as Section -- promote to Section, we then test if the section_name is in the table
local dl_html = nodes:get("dl.function")[i]:outer_html() local dl_html = html_nodes:get("dl.function")[i]:outer_html()
if section_scrapers[section_name] then if section_scrapers[section_name] then
section_scrapers[section_name](dl_html, module_doc) local module_nodes, global_nodes, signals_name = section_scrapers[section_name](dl_html)
for _, node in ipairs(module_nodes) do
table.insert(module_root.children, node)
end
for _, node in ipairs(global_nodes) do
table.insert(other_nodes, node)
end
print(signals_name)
for _, signal_name in ipairs(signals_name) do
table.insert(
module_signals_node.children,
ast.create_node("identifier", signal_name)
)
end
else else
log:warn("Section scraper not implemented: " .. section_name) log:warn("Section scraper not implemented: " .. section_name)
end end
end end
return module_doc return module_root, other_nodes
end end
return module return module

View File

@ -72,4 +72,44 @@ function scraper_utils.scrape_tuples<T>(html: string, query_selectors: { string
return ret return ret
end end
--- Iterate over tuples of nodes matched by a set of query selectors.
--
-- The HTML is scanned once. Every node for which `stack:is(selector)` holds is
-- appended to that selector's list, in the order the scanner reports them.
-- The returned iterator then yields, on its i-th call, a table mapping each
-- selector to the i-th node collected for it.
--
-- @param html The HTML source handed to the scanner.
-- @param query_selectors The selectors to collect. They must all match the
--   same number of nodes. An empty list yields an empty iteration.
-- @return An iterator function producing one `{ selector : node }` table per
--   call, and `nil` once every tuple has been produced.
--
-- Raises "Query selectors do not have the same number of siblings" when the
-- selectors do not match the same number of nodes.
function scraper_utils.iter_tuples(html: string, query_selectors: { string }): function(): { string : scan.HTMLNode }
    -- No selector means no tuple to build. Bail out early, otherwise
    -- `siblings[query_selectors[1]]` below would be `siblings[nil]` and taking
    -- its length would raise an unrelated runtime error.
    if #query_selectors == 0 then
        return function(): { string : scan.HTMLNode }
            return nil
        end
    end

    -- One list of matched nodes per selector.
    local siblings: { string : { scan.HTMLNode } } = {}
    for _, query_selector in ipairs(query_selectors) do
        siblings[query_selector] = {}
    end

    scanner.scan_html(
        html,
        function(stack: scan.NodeStack)
            for _, query_selector in ipairs(query_selectors) do
                if stack:is(query_selector) then
                    table.insert(siblings[query_selector], stack:current())
                end
            end
        end
    )

    -- Tuples are built by position, so every selector must have matched the
    -- same number of nodes. The first selector's count is the reference.
    local siblings_count = #siblings[query_selectors[1]]
    for _, query_selector in ipairs(query_selectors) do
        if #siblings[query_selector] ~= siblings_count then
            error("Query selectors do not have the same number of siblings")
        end
    end

    -- Closure state: index of the last tuple handed out.
    local i = 0
    return function(): { string : scan.HTMLNode }
        i = i + 1
        if i > siblings_count then
            return nil
        end

        local node_list: { string : scan.HTMLNode } = {}
        for _, query_selector in ipairs(query_selectors) do
            node_list[query_selector] = siblings[query_selector][i]
        end
        return node_list
    end
end
return scraper_utils return scraper_utils

View File

@ -35,6 +35,16 @@ function utils.map<T, U>(list: { T }, iteratee: function(value: T, position: int
return mapped return mapped
end end
--- Collect every value stored in a table into a new sequence.
--
-- Keys are discarded. Entries are visited with `pairs`, so the order of the
-- returned sequence is unspecified.
--
-- @param t The table whose values are collected.
-- @return A new sequence holding each value of `t`, cast to `T`.
function utils.values<T>(t: table): { T }
    local collected: { T } = {}
    local count = 0
    for _, value in pairs(t) do
        count = count + 1
        collected[count] = value as T
    end
    return collected
end
function utils.sanitize_string(s: string): string function utils.sanitize_string(s: string): string
return (stringx.strip(web_sanitize.extract_text(s))) return (stringx.strip(web_sanitize.extract_text(s)))
end end