feat(scraper): implement basic Module_Doc scraper
This commit is contained in:
parent
7285f020c0
commit
ff446d3bea
|
@ -0,0 +1,23 @@
|
||||||
|
local class = require "pl.class"
|
||||||
|
local List = require "pl.List"
|
||||||
|
|
||||||
|
--- Describes a single scraped function: its name, its parameters and its
-- return types.
--
-- The class is created through `class.Function_Info()` so that the name
-- Penlight derives from the accessed field matches this module. The earlier
-- `class.Module_Doc()` call labelled this class as "Module_Doc".
local Function_Info = class.Function_Info()

--- Initialise an empty description: blank name, no parameters and no
-- return types. Both collections are `pl.List` instances.
function Function_Info:_init()
    self.name = ""
    self.parameters = List()
    self.return_types = List()
end

--- Append one entry to the list of return types.
-- @param return_type value to store; it is appended unchanged
function Function_Info:append_return_type(return_type)
    self.return_types:append(return_type)
end

--- Append one parameter record of the form `{ name = ..., type = ... }`.
-- @param name parameter name, stored unchanged
-- @param type parameter type, stored unchanged
function Function_Info:append_parameter(name, type)
    self.parameters:append {
        name = name,
        type = type,
    }
end

return Function_Info
|
|
@ -0,0 +1,12 @@
|
||||||
|
local class = require "pl.class"
|
||||||
|
local List = require "pl.List"
|
||||||
|
|
||||||
|
--- Container for the documentation scraped from one module page.
-- Holds three `pl.List` collections: constructors, methods and static
-- functions.
local Module_Doc = class.Module_Doc()

--- Give every collection field its own fresh, empty list.
function Module_Doc:_init()
    for _, field in ipairs { "constructors", "methods", "static_functions" } do
        self[field] = List()
    end
end

return Module_Doc
|
|
@ -0,0 +1,84 @@
|
||||||
|
local Function_Info = require "entities.Function_Info"
|
||||||
|
local Module_Doc = require "entities.Module_Doc"
|
||||||
|
local scraper_utils = require "scraper.utils"
|
||||||
|
local utils = require "utils"
|
||||||
|
|
||||||
|
--- Read a function name from a node's `attr.name`, dropping everything up
-- to and including the last ":" it contains.
-- @param function_name_node node exposing `attr.name`, or nil/false
-- @return the trimmed name, or the falsy argument itself when no node was
-- given
local function extract_function_name(function_name_node)
    if not function_name_node then
        return function_name_node
    end

    -- gsub also returns a substitution count; binding to a single local
    -- keeps only the resulting string.
    local short_name = function_name_node.attr.name:gsub(".*:", "")
    return short_name
end
|
||||||
|
|
||||||
|
--- Collect the return types listed under a function's return node.
-- Every element matching "span.types .type" inside the node's outer HTML
-- contributes its sanitized inner text.
-- @param function_return_types_node node exposing `outer_html()`, or nil
-- @return whatever `scraper_utils.scrape` returns, or an empty table when
-- no node was given
local function extract_function_return_types(function_return_types_node)
    if function_return_types_node then
        local function read_type(node)
            return utils.sanitize_string(node:inner_text())
        end

        return scraper_utils.scrape(
            function_return_types_node:outer_html(),
            "span.types .type",
            read_type
        )
    end

    return {}
end
|
||||||
|
|
||||||
|
--- Scrape every function described in one section's `<dl>` markup.
-- For each tuple produced by `scraper_utils.scrape_tuples` a Function_Info is
-- built from the "dt a" node (name) and the "dd ol" node (return types).
-- @param dl HTML of the section's definition list
-- @return the value returned by `scraper_utils.scrape_tuples`
local function extract_section_functions(dl)
    local name_selector = "dt a"
    local return_types_selector = "dd ol"

    return scraper_utils.scrape_tuples(
        dl,
        { name_selector, return_types_selector },
        function(nodes)
            local function_info = Function_Info()

            function_info.name = extract_function_name(nodes[name_selector])

            -- Fill the list created by Function_Info instead of replacing
            -- it, so `return_types` keeps the container type the class set
            -- up and `append_return_type` remains usable afterwards.
            local return_types =
                extract_function_return_types(nodes[return_types_selector])
            for _, return_type in ipairs(return_types) do
                function_info:append_return_type(return_type)
            end

            return function_info
        end
    )
end
|
||||||
|
|
||||||
|
local module = {}

-- Section headings whose <dl> is scraped, mapped to the Module_Doc field
-- that receives the scraped functions.
local section_fields = {
    ["Constructors"] = "constructors",
    ["Static module functions"] = "static_functions",
    ["Object methods"] = "methods",
}

-- Section headings that are recognised but not scraped yet.
local unimplemented_sections = {
    ["Object properties"] = true,
    ["Deprecated object properties"] = true,
    ["Signals"] = true,
}

--- Build a Module_Doc from the HTML of a documentation page.
-- The i-th "h2.section-header" is paired with the i-th "dl.function"; the
-- header text decides which Module_Doc field receives the scraped functions.
-- Recognised but unsupported sections only print a notice.
-- @param html page markup to scan
-- @return the populated Module_Doc
-- Raises an error when the number of headers and lists differ, or when a
-- header carries an unknown section name.
function module.get_doc_from_page(html)
    local nodes = scraper_utils.extract_nodes(html, {
        "h2.section-header",
        "dl.function",
    })
    local headers = nodes:get "h2.section-header"
    local function_lists = nodes:get "dl.function"

    if #headers ~= #function_lists then
        error "The list aren't the same size!"
    end

    local module_doc = Module_Doc()

    for i, h2 in ipairs(headers) do
        local section_name = utils.sanitize_string(h2:inner_text())
        local field = section_fields[section_name]

        if field then
            module_doc[field] =
                extract_section_functions(function_lists[i]:outer_html())
        elseif unimplemented_sections[section_name] then
            -- The notice is derived from the heading itself, so each
            -- section reports its own name.
            print("Not implemented: " .. section_name)
        else
            error("Unknown section name: " .. section_name)
        end
    end

    return module_doc
end
|
||||||
|
|
||||||
|
return module
|
|
@ -1,5 +1,8 @@
|
||||||
|
local List = require "pl.List"
|
||||||
local log = require "logger"
|
local log = require "logger"
|
||||||
|
local Map = require "pl.Map"
|
||||||
local scanner = require "web_sanitize.query.scan_html"
|
local scanner = require "web_sanitize.query.scan_html"
|
||||||
|
local tablex = require "pl.tablex"
|
||||||
|
|
||||||
local scraper_utils = {}
|
local scraper_utils = {}
|
||||||
|
|
||||||
|
@ -22,4 +25,44 @@ function scraper_utils.scrape(html, query_selector, extract_callback)
|
||||||
return ret
|
return ret
|
||||||
end
|
end
|
||||||
|
|
||||||
|
--- Scan `html` once and group the visited nodes by the selector they match.
-- @param html markup handed to the HTML scanner
-- @param query_selectors table of selector strings
-- @return a Map from each selector to a List of the nodes for which
-- `stack:is(selector)` held during the scan (an empty List when none did)
function scraper_utils.extract_nodes(html, query_selectors)
    local matches = Map()

    -- Every selector gets a list up front, so lookups always yield a List.
    for _, selector in pairs(query_selectors) do
        matches:set(selector, List())
    end

    -- Called by the scanner with the current node stack; a node is filed
    -- under every selector it satisfies.
    local function collect(stack)
        for _, selector in pairs(query_selectors) do
            if stack:is(selector) then
                matches:get(selector):append(stack:current())
            end
        end
    end

    scanner.scan_html(html, collect)

    return matches
end
|
||||||
|
|
||||||
|
--- Scrape several selectors at once and hand the matches to a callback,
-- one tuple at a time.
--
-- Matches are paired purely by position: tuple `i` holds the i-th match of
-- every selector, and the number of tuples equals the number of matches of
-- the FIRST selector. A selector with fewer matches leaves its entry nil.
--
-- @param html markup to scan
-- @param query_selectors array of selector strings
-- @param extract_callback function receiving a table keyed by selector;
-- its result is collected into the returned array
-- @return array of callback results; a callback that raises is logged via
-- `log:error` and contributes nothing
function scraper_utils.scrape_tuples(html, query_selectors, extract_callback)
    -- With no selectors there is nothing to pair up, and the loop bound
    -- below would otherwise take the length of a nil lookup.
    if #query_selectors == 0 then
        return {}
    end

    local nodes = scraper_utils.extract_nodes(html, query_selectors)

    local ret = {}

    for i = 1, #nodes:get(query_selectors[1]) do
        local node_list = {}
        for _, query_selector in ipairs(query_selectors) do
            node_list[query_selector] = nodes:get(query_selector)[i]
        end

        -- One malformed entry must not abort the whole scrape.
        local success, info = pcall(extract_callback, node_list)

        if success then
            table.insert(ret, info)
        else
            log:error { message = info }
        end
    end

    return ret
end
|
||||||
|
|
||||||
return scraper_utils
|
return scraper_utils
|
||||||
|
|
Loading…
Reference in New Issue