feat(scraper): implement all `section_scrapers`
This commit is contained in:
parent
11721a3331
commit
de8c7c682b
|
@ -32,6 +32,7 @@
|
||||||
"setopt",
|
"setopt",
|
||||||
"stringx",
|
"stringx",
|
||||||
"Stylua",
|
"Stylua",
|
||||||
|
"sublist",
|
||||||
"tablex",
|
"tablex",
|
||||||
"tmpl",
|
"tmpl",
|
||||||
"wibox",
|
"wibox",
|
||||||
|
|
|
@ -28,77 +28,50 @@ local function extract_item_name(item_name_node: scan.HTMLNode): string
|
||||||
return item_name_node and ((item_name_node.attr.name as string):gsub("^.*[%.:]", ""))
|
return item_name_node and ((item_name_node.attr.name as string):gsub("^.*[%.:]", ""))
|
||||||
end
|
end
|
||||||
|
|
||||||
-- local function extract_function_parameter_Parameters(tr_node: scan.HTMLNode): { Variable_Info.Variable_Info }
|
local function extract_function_parameters(table_html: string): { Node }
|
||||||
-- local query_selectors = {
|
local current_record_parameter: Node = nil
|
||||||
-- name = "span.parameter",
|
|
||||||
-- types = "span.types"
|
|
||||||
-- }
|
|
||||||
|
|
||||||
-- return scraper_utils.scrape_tuples(
|
return scraper_utils.scrape(table_html, "tr", function(tr: scan.HTMLNode): Node
|
||||||
-- tr_node:outer_html(),
|
local tr_html = tr:outer_html()
|
||||||
-- { query_selectors.name, query_selectors.types },
|
local name_node <const> = scraper_utils.find(tr_html, "span.parameter")[1]
|
||||||
-- function(nodes: { string : scan.HTMLNode | nil }): Variable_Info.Variable_Info
|
local types_node <const> = scraper_utils.find(tr_html, "span.types")[1]
|
||||||
-- return Variable_Info(
|
if not name_node or not types_node then
|
||||||
-- extract_node_text(nodes[query_selectors.name] as scan.HTMLNode),
|
return nil
|
||||||
-- parse_parameter_types(extract_node_text(nodes[query_selectors.types] as scan.HTMLNode))
|
end
|
||||||
-- )
|
|
||||||
-- end)
|
|
||||||
-- end
|
|
||||||
|
|
||||||
-- local function extract_function_parameters(function_parameters_node: scan.HTMLNode): { Variable_Info.Variable_Info }
|
local name <const> = extract_node_text(name_node)
|
||||||
-- local current_record_parameter: Type_Info.Type_Info | nil = nil
|
local types <const> = parse_parameter_types(extract_node_text(types_node))
|
||||||
|
|
||||||
-- return scraper_utils.scrape(
|
if tr.attr ~= nil and tr.attr.class == "see_also_sublist" and current_record_parameter then
|
||||||
-- function_parameters_node:outer_html(),
|
local field = ast.create_node("variable", name)
|
||||||
-- "tr",
|
field.types = types
|
||||||
-- function(line_node: scan.HTMLNode): Variable_Info.Variable_Info
|
table.insert(current_record_parameter.children, field)
|
||||||
-- local parameters = extract_function_parameter_Parameters(line_node)
|
return nil
|
||||||
-- if #parameters == 0 then
|
end
|
||||||
-- return nil
|
|
||||||
-- elseif #parameters ~= 1 then
|
|
||||||
-- log:error(logger.message_with_metadata("Expected 1 parameter by <tr> node",
|
|
||||||
-- { len = #parameters, line_node = line_node, parameters = parameters }))
|
|
||||||
-- error("Expected 1 parameter by <tr> node")
|
|
||||||
-- end
|
|
||||||
-- local name, types = parameters[1].name, parameters[1].types
|
|
||||||
|
|
||||||
-- if line_node.attr ~= nil and line_node.attr.class == "see_also_sublist" and current_record_parameter then
|
-- We wrongly tried to convert a table to a record
|
||||||
-- local record_parameter = current_record_parameter as Type_Info.Type_Info
|
if current_record_parameter then
|
||||||
-- if not record_parameter.record_entries then
|
current_record_parameter.token = "variable"
|
||||||
-- record_parameter.record_entries = Map()
|
current_record_parameter.name = utils.lowercase(current_record_parameter.name)
|
||||||
-- end
|
current_record_parameter.types = { "table" }
|
||||||
|
current_record_parameter.children = nil
|
||||||
|
current_record_parameter = nil
|
||||||
|
end
|
||||||
|
|
||||||
-- (record_parameter.record_entries as Map<string, List<Type_Info.Type_Info>>):set(name, types)
|
if #types == 1 and types[1] == "table" then
|
||||||
|
current_record_parameter = ast.create_node("record", utils.capitalize(name))
|
||||||
|
return current_record_parameter
|
||||||
|
end
|
||||||
|
|
||||||
-- return nil
|
local field = ast.create_node("variable", name)
|
||||||
-- end
|
field.types = types
|
||||||
|
return field
|
||||||
|
end)
|
||||||
|
end
|
||||||
|
|
||||||
-- if #types == 1 and types[1].name == "table" then
|
local function extract_function_return_types(ol_html: string): { string }
|
||||||
-- local record_name = utils.capitalize(name)
|
return scraper_utils.scrape(ol_html, "span.types .type", extract_node_text)
|
||||||
-- current_record_parameter = Type_Info(record_name)
|
end
|
||||||
-- return Variable_Info(
|
|
||||||
-- name,
|
|
||||||
-- List({ current_record_parameter })
|
|
||||||
-- )
|
|
||||||
-- end
|
|
||||||
|
|
||||||
-- return Variable_Info(name, types)
|
|
||||||
-- end)
|
|
||||||
-- end
|
|
||||||
|
|
||||||
-- local function extract_function_return_types(function_return_types_node: scan.HTMLNode): List<Type_Info.Type_Info>
|
|
||||||
-- if not function_return_types_node then
|
|
||||||
-- return {}
|
|
||||||
-- end
|
|
||||||
|
|
||||||
-- local selector = "span.types .type"
|
|
||||||
-- local html = function_return_types_node:outer_html()
|
|
||||||
|
|
||||||
-- return List(scraper_utils.scrape(html, selector, extract_node_text)):map(
|
|
||||||
-- function(type_name: string): Type_Info.Type_Info
|
|
||||||
-- return Type_Info(type_name)
|
|
||||||
-- end)
|
|
||||||
-- end
|
|
||||||
|
|
||||||
local function extract_property_constraints(property_constraint_node: scan.HTMLNode): { string }
|
local function extract_property_constraints(property_constraint_node: scan.HTMLNode): { string }
|
||||||
return scraper_utils.scrape(
|
return scraper_utils.scrape(
|
||||||
|
@ -108,50 +81,40 @@ local function extract_property_constraints(property_constraint_node: scan.HTMLN
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
-- local function extract_section_functions(dl: string): { Function_Info.Function_Info }
|
local function extract_section_functions(dl: string): { Node }
|
||||||
-- local query_selectors = {
|
local list_query_selectors <const>: { string : string } = {
|
||||||
-- header = "dt",
|
function_name = "dt a",
|
||||||
-- name = "a",
|
body = "dd",
|
||||||
-- body = "dd",
|
}
|
||||||
-- parameters = "table",
|
|
||||||
-- return_types = "ol",
|
|
||||||
-- }
|
|
||||||
|
|
||||||
-- return scraper_utils.scrape_tuples(
|
local functions<const>: { Node } = {}
|
||||||
-- dl,
|
|
||||||
-- { query_selectors.header, query_selectors.body },
|
for nodes in scraper_utils.iter_tuples(
|
||||||
-- function(nodes: { string : scan.HTMLNode | nil }): Function_Info.Function_Info
|
dl,
|
||||||
-- if not nodes[query_selectors.header] or not nodes[query_selectors.body] then
|
utils.values(list_query_selectors)
|
||||||
-- log:warn(
|
) do
|
||||||
-- logger.message_with_metadata(
|
local function_node <const> = ast.create_node(
|
||||||
-- "Missing header or body",
|
"function",
|
||||||
-- { nodes = nodes }
|
extract_item_name(nodes[list_query_selectors.function_name])
|
||||||
-- )
|
)
|
||||||
-- )
|
|
||||||
-- error("Missing header or body")
|
local body_html = nodes[list_query_selectors.body]:outer_html()
|
||||||
-- end
|
|
||||||
-- local header = nodes[query_selectors.header] as scan.HTMLNode
|
local parameter_node = scraper_utils.find(body_html, "table")[1]
|
||||||
-- local body = nodes[query_selectors.body] as scan.HTMLNode
|
function_node.parameters = parameter_node and
|
||||||
-- local body_elements = scraper_utils.extract_nodes(
|
extract_function_parameters(parameter_node:outer_html()) or
|
||||||
-- body:outer_html(),
|
{}
|
||||||
-- { query_selectors.parameters, query_selectors.return_types }
|
|
||||||
-- )
|
local return_node = scraper_utils.find(body_html, "ol")[1]
|
||||||
-- return Function_Info(
|
function_node.return_types = return_node and
|
||||||
-- scraper_utils.scrape(
|
extract_function_return_types(return_node:outer_html()) or
|
||||||
-- header:outer_html(),
|
{}
|
||||||
-- query_selectors.name,
|
|
||||||
-- extract_item_name
|
table.insert(functions, function_node)
|
||||||
-- )[1],
|
end
|
||||||
-- #body_elements:get(query_selectors.parameters) ~= 0 and
|
|
||||||
-- List(extract_function_parameters(body_elements:get(query_selectors.parameters)[1])) or
|
return functions
|
||||||
-- (List() as List<Variable_Info.Variable_Info>),
|
end
|
||||||
-- #body_elements:get(query_selectors.return_types) ~= 0 and
|
|
||||||
-- extract_function_return_types(body_elements:get(query_selectors.return_types)[1]) or
|
|
||||||
-- (List() as List<Type_Info.Type_Info>)
|
|
||||||
-- )
|
|
||||||
-- end
|
|
||||||
-- )
|
|
||||||
-- end
|
|
||||||
|
|
||||||
local function extract_section_variables(dl: string): { Node }, { string }
|
local function extract_section_variables(dl: string): { Node }, { string }
|
||||||
local query_selectors <const>: { string : string } = {
|
local query_selectors <const>: { string : string } = {
|
||||||
|
@ -160,8 +123,8 @@ local function extract_section_variables(dl: string): { Node }, { string }
|
||||||
variable_property_constraint = "dd span.property_type",
|
variable_property_constraint = "dd span.property_type",
|
||||||
}
|
}
|
||||||
|
|
||||||
local variables <const> = {}
|
local variables <const>: { Node } = {}
|
||||||
local signals <const> = {}
|
local signals <const>: { string } = {}
|
||||||
|
|
||||||
for nodes in scraper_utils.iter_tuples(
|
for nodes in scraper_utils.iter_tuples(
|
||||||
dl,
|
dl,
|
||||||
|
@ -197,10 +160,10 @@ local function extract_section_signal(dl: string): { string }
|
||||||
end
|
end
|
||||||
|
|
||||||
local enum Section
|
local enum Section
|
||||||
-- "Constructors"
|
"Constructors"
|
||||||
-- "Static module functions"
|
"Static module functions"
|
||||||
"Object properties"
|
"Object properties"
|
||||||
-- "Object methods"
|
"Object methods"
|
||||||
"Signals"
|
"Signals"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -208,26 +171,27 @@ end
|
||||||
-- - Nodes that should be added to the module
|
-- - Nodes that should be added to the module
|
||||||
-- - Nodes that should be added to the global scope
|
-- - Nodes that should be added to the global scope
|
||||||
-- - Strings that should be added to the record Signals
|
-- - Strings that should be added to the record Signals
|
||||||
local section_scrapers <total>: { Section : function(html: string): { Node }, { Node }, { string } } = {
|
local section_scrapers <total>: { Section : function(html: string, module_name: string): { Node }, { Node }, { string } } = {
|
||||||
-- ["Constructors"] = function(html: string, module_doc: Module_Doc.Module_Doc)
|
["Constructors"] = function(html: string): { Node }, { Node }, { string }
|
||||||
-- module_doc.constructors = List(extract_section_functions(html))
|
return extract_section_functions(html), {}, {}
|
||||||
-- end,
|
end,
|
||||||
-- ["Static module functions"] = function(html: string, module_doc: Module_Doc.Module_Doc)
|
["Static module functions"] = function(html: string): { Node }, { Node }, { string }
|
||||||
-- module_doc.static_functions = List(extract_section_functions(html))
|
local static_functions = extract_section_functions(html)
|
||||||
-- end,
|
return static_functions, {}, {}
|
||||||
|
end,
|
||||||
["Object properties"] = function(html: string): { Node }, { Node }, { string }
|
["Object properties"] = function(html: string): { Node }, { Node }, { string }
|
||||||
local properties, signals = extract_section_variables(html)
|
local properties, signals = extract_section_variables(html)
|
||||||
return properties, {}, signals
|
return properties, {}, signals
|
||||||
end,
|
end,
|
||||||
-- ["Object methods"] = function(html: string, module_doc: Module_Doc.Module_Doc)
|
["Object methods"] = function(html: string, module_name: string): { Node }, { Node }, { string }
|
||||||
-- local self_parameter = Variable_Info("self", List({ Type_Info(module_doc.record_name) }))
|
local methods <const> = extract_section_functions(html)
|
||||||
-- module_doc.methods = List(extract_section_functions(html)):map(
|
for _, method in ipairs(methods) do
|
||||||
-- function(method: Function_Info.Function_Info): Function_Info.Function_Info
|
local self_parameter = ast.create_node("variable", "self")
|
||||||
-- method.parameters:insert(1, self_parameter)
|
self_parameter.types = { module_name }
|
||||||
-- return method
|
table.insert(method.parameters, 1, self_parameter)
|
||||||
-- end
|
end
|
||||||
-- )
|
return methods, {}, {}
|
||||||
-- end,
|
end,
|
||||||
["Signals"] = function(html: string): { Node }, { Node }, { string }
|
["Signals"] = function(html: string): { Node }, { Node }, { string }
|
||||||
local signals = extract_section_signal(html)
|
local signals = extract_section_signal(html)
|
||||||
return {}, {}, signals
|
return {}, {}, signals
|
||||||
|
@ -263,7 +227,7 @@ function module.get_doc_from_page(html: string, module_name: string): Node, { No
|
||||||
local dl_html = html_nodes:get("dl.function")[i]:outer_html()
|
local dl_html = html_nodes:get("dl.function")[i]:outer_html()
|
||||||
|
|
||||||
if section_scrapers[section_name] then
|
if section_scrapers[section_name] then
|
||||||
local module_nodes, global_nodes, signals_name = section_scrapers[section_name](dl_html)
|
local module_nodes, global_nodes, signals_name = section_scrapers[section_name](dl_html, record_name)
|
||||||
for _, node in ipairs(module_nodes) do
|
for _, node in ipairs(module_nodes) do
|
||||||
table.insert(module_root.children, node)
|
table.insert(module_root.children, node)
|
||||||
end
|
end
|
||||||
|
|
|
@ -48,28 +48,16 @@ function scraper_utils.extract_nodes(html: string, query_selectors: { string }):
|
||||||
return siblings
|
return siblings
|
||||||
end
|
end
|
||||||
|
|
||||||
function scraper_utils.scrape_tuples<T>(html: string, query_selectors: { string }, extract_callback: function(tuple: { string : scan.HTMLNode | nil }): T): { T }
|
function scraper_utils.find(html: string, query_selector: string): { scan.HTMLNode }
|
||||||
local nodes = scraper_utils.extract_nodes(html, query_selectors)
|
local nodes: { scan.HTMLNode } = {}
|
||||||
|
|
||||||
local ret: { T } = {}
|
scanner.scan_html(html, function(stack: scan.NodeStack)
|
||||||
|
if stack:is(query_selector) then
|
||||||
for i = 1, #nodes:get(query_selectors[1]) do
|
table.insert(nodes, stack:current())
|
||||||
local node_list: { string : scan.HTMLNode | nil } = {}
|
end
|
||||||
tablex.foreach(query_selectors, function(query_selector: string)
|
|
||||||
node_list[query_selector] = nodes:get(query_selector)[i] or nil
|
|
||||||
end)
|
end)
|
||||||
local success, info_or_error = pcall(extract_callback, node_list)
|
|
||||||
|
|
||||||
if not success then
|
return nodes
|
||||||
local error_message = info_or_error as string
|
|
||||||
log:error(logger.message_with_metadata("Extraction error", { error = error_message }))
|
|
||||||
else
|
|
||||||
local info = info_or_error as T
|
|
||||||
table.insert(ret, info)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return ret
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function scraper_utils.iter_tuples(html: string, query_selectors: { string }): function(): { string : scan.HTMLNode }
|
function scraper_utils.iter_tuples(html: string, query_selectors: { string }): function(): { string : scan.HTMLNode }
|
||||||
|
|
Loading…
Reference in New Issue