Merge pull request 'Remove htmlparser dependence (#2)' (#8) from feat/#2 into master
ci/woodpecker/push/lint Pipeline was successful
Details
ci/woodpecker/push/lint Pipeline was successful
Details
Reviewed-on: #8
This commit is contained in:
commit
f01ad65e05
|
@ -12,7 +12,6 @@ dependencies = {
|
||||||
"lualogging 1.6.0",
|
"lualogging 1.6.0",
|
||||||
"inspect 3.1.3",
|
"inspect 3.1.3",
|
||||||
"ansicolors 1.0.2",
|
"ansicolors 1.0.2",
|
||||||
"htmlparser 0.3.9",
|
|
||||||
"web_sanitize 1.3.0",
|
"web_sanitize 1.3.0",
|
||||||
"penlight 1.13.1",
|
"penlight 1.13.1",
|
||||||
"luasocket 3.1.0-1",
|
"luasocket 3.1.0-1",
|
||||||
|
|
|
@ -0,0 +1,23 @@
|
||||||
|
local class = require "pl.class"
|
||||||
|
local List = require "pl.List"
|
||||||
|
|
||||||
|
-- Class for the entity describing one documented function (its name,
-- parameters and return types).
-- Penlight's `class.<Name>()` form names the class after the key it is
-- accessed with, so the key has to be `Function_Info`, not `Module_Doc`.
local Function_Info = class.Function_Info()
|
||||||
|
|
||||||
|
--- Initialise an empty function description.
-- The instance starts with an empty string as `name` and with its own,
-- freshly created `parameters` and `return_types` lists.
function Function_Info:_init()
    self.return_types = List()
    self.parameters = List()
    self.name = ""
end
|
||||||
|
|
||||||
|
--- Record one more return type for this function.
-- @param return_type value appended at the end of `self.return_types`
function Function_Info:append_return_type(return_type)
    local return_types = self.return_types
    return_types:append(return_type)
end
|
||||||
|
|
||||||
|
--- Record one more parameter for this function.
-- The parameter is stored as a `{ name = ..., type = ... }` table appended
-- at the end of `self.parameters`.
-- @param name stored under the `name` key
-- @param type stored under the `type` key
function Function_Info:append_parameter(name, type)
    local parameter = { name = name, type = type }
    self.parameters:append(parameter)
end
|
||||||
|
|
||||||
|
return Function_Info
|
|
@ -0,0 +1,12 @@
|
||||||
|
local class = require "pl.class"
|
||||||
|
local List = require "pl.List"
|
||||||
|
|
||||||
|
local Module_Doc = class.Module_Doc()
|
||||||
|
|
||||||
|
--- Initialise an empty module documentation.
-- Each of `constructors`, `methods` and `static_functions` gets its own
-- empty list.
function Module_Doc:_init()
    for _, field in ipairs { "constructors", "methods", "static_functions" } do
        self[field] = List()
    end
end
|
||||||
|
|
||||||
|
return Module_Doc
|
|
@ -0,0 +1,10 @@
|
||||||
|
local class = require "pl.class"
|
||||||
|
|
||||||
|
local Module_Info = class.Module_Info()
|
||||||
|
|
||||||
|
--- Initialise a module description.
-- @param name stored as `self.name`
-- @param uri stored as `self.uri`
function Module_Info:_init(name, uri)
    self.name, self.uri = name, uri
end
|
||||||
|
|
||||||
|
return Module_Info
|
|
@ -3,19 +3,19 @@ local inspect = require "inspect"
|
||||||
local log = require "logger"
|
local log = require "logger"
|
||||||
local properties = require "properties"
|
local properties = require "properties"
|
||||||
local scraper = require "scraper"
|
local scraper = require "scraper"
|
||||||
local generator = require "generator"
|
-- local generator = require "generator"
|
||||||
|
|
||||||
log:info(
|
log:info(
|
||||||
inspect { message = "Start extraction", base_url = properties.base_url }
|
inspect { message = "Start extraction", base_url = properties.base_url }
|
||||||
)
|
)
|
||||||
|
|
||||||
-- local index = crawler.fetch(properties.base_url .. properties.index_uri)
|
local index = crawler.fetch(properties.base_url .. properties.index_uri)
|
||||||
-- local modules = scraper.get_modules_from_index(
|
|
||||||
-- index,
|
|
||||||
-- properties.ignored_modules
|
|
||||||
-- )
|
|
||||||
|
|
||||||
-- log:info(inspect { modules_found = #modules })
|
-- local modules =
|
||||||
|
-- scraper.get_modules_from_index(index, properties.ignored_modules)
|
||||||
|
local module_infos = scraper.module_info_list.get_modules_from_index(index)
|
||||||
|
|
||||||
|
log:info(inspect { modules_found = #module_infos })
|
||||||
|
|
||||||
-- for i = 1, 1 do -- #modules do
|
-- for i = 1, 1 do -- #modules do
|
||||||
-- local m = modules[i]
|
-- local m = modules[i]
|
||||||
|
@ -25,12 +25,15 @@ log:info(
|
||||||
-- log:info(inspect { items })
|
-- log:info(inspect { items })
|
||||||
-- end
|
-- end
|
||||||
|
|
||||||
local page =
|
local html =
|
||||||
crawler.fetch(properties.base_url .. "/widgets/wibox.widget.imagebox.html")
|
crawler.fetch(properties.base_url .. "/widgets/wibox.widget.textbox.html")
|
||||||
local items = scraper.get_doc_from_page(page)
|
local module_doc = scraper.module_doc.get_doc_from_page(html)
|
||||||
|
log:info(inspect { module_doc = module_doc })
|
||||||
|
|
||||||
|
-- local items = scraper.get_doc_from_page(page)
|
||||||
-- log:info(inspect { items })
|
-- log:info(inspect { items })
|
||||||
|
|
||||||
generator.write(
|
-- generator.write(
|
||||||
generator.generate_teal(items),
|
-- generator.generate_teal(items),
|
||||||
properties.out_directory .. "/test.tl"
|
-- properties.out_directory .. "/test.tl"
|
||||||
)
|
-- )
|
||||||
|
|
|
@ -1,212 +1,5 @@
|
||||||
local htmlparser = require "htmlparser"
|
|
||||||
local log = require "logger"
|
|
||||||
local utils = require "utils"
|
|
||||||
|
|
||||||
local scraper = {}
|
|
||||||
|
|
||||||
function scraper.get_modules_from_index(html, ignored)
|
|
||||||
local document = htmlparser.parse(html)
|
|
||||||
|
|
||||||
local modules = utils.map(document "#navigation ul > li a", function(node)
|
|
||||||
return {
|
return {
|
||||||
name = utils.sanitize_string(node:getcontent()),
|
module_doc = require "scraper.module_doc",
|
||||||
uri = node.attributes.href,
|
module_info_list = require "scraper.module_info_list",
|
||||||
|
utils = require "scraper.utils",
|
||||||
}
|
}
|
||||||
end)
|
|
||||||
|
|
||||||
local filtered_modules = utils.filter(modules, function(module)
|
|
||||||
return not utils.has_item(ignored, module.name)
|
|
||||||
end)
|
|
||||||
|
|
||||||
return filtered_modules
|
|
||||||
end
|
|
||||||
|
|
||||||
local function extract_first_node_or_fail(node, selector)
|
|
||||||
local extracted = node(selector)[1]
|
|
||||||
|
|
||||||
if not extracted then
|
|
||||||
log:error {
|
|
||||||
message = "Can't find `" .. selector .. "` element!",
|
|
||||||
node = node:gettext(),
|
|
||||||
}
|
|
||||||
error "extract_first_node_or_fail"
|
|
||||||
end
|
|
||||||
|
|
||||||
return extracted
|
|
||||||
end
|
|
||||||
|
|
||||||
local function extract_first_node(node, selector)
|
|
||||||
local extracted = node(selector)[1]
|
|
||||||
|
|
||||||
if not extracted then
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
|
|
||||||
return extracted
|
|
||||||
end
|
|
||||||
|
|
||||||
local function extract_inner_floating_text(node)
|
|
||||||
local html = node:getcontent()
|
|
||||||
|
|
||||||
-- Remove inner tags from the html
|
|
||||||
for _, n in ipairs(node.nodes) do
|
|
||||||
html = utils.replace(html, n:gettext(), "")
|
|
||||||
end
|
|
||||||
|
|
||||||
return utils.sanitize_string(html)
|
|
||||||
end
|
|
||||||
|
|
||||||
local function extract_item_content_parameter_list(content)
|
|
||||||
for i, n in ipairs(content.nodes) do
|
|
||||||
-- The parameters <ul> is the next element after one of these <h3>
|
|
||||||
if
|
|
||||||
n.name == "h3"
|
|
||||||
and utils.has_item(
|
|
||||||
{ "Parameters:", "Type constraints:", "Arguments" },
|
|
||||||
utils.sanitize_string(n:getcontent())
|
|
||||||
)
|
|
||||||
then
|
|
||||||
return content.nodes[i + 1]
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
|
|
||||||
local function process_item_header_type(node)
|
|
||||||
local types_node = extract_first_node(node, "span.summary_type")
|
|
||||||
|
|
||||||
if not types_node then
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
|
|
||||||
local types_string = string.match(types_node:getcontent(), "%((.-)%)")
|
|
||||||
or types_node:getcontent()
|
|
||||||
types_string = utils.replace(types_string, "or", ",")
|
|
||||||
local splitted = string.gmatch(types_string, "([^,]+)")
|
|
||||||
|
|
||||||
local types = {}
|
|
||||||
for t in splitted do
|
|
||||||
table.insert(types, utils.sanitize_string(t))
|
|
||||||
end
|
|
||||||
|
|
||||||
return types
|
|
||||||
end
|
|
||||||
|
|
||||||
local function process_item_header(node)
|
|
||||||
local name_node = extract_first_node_or_fail(node, "strong")
|
|
||||||
|
|
||||||
-- Remove starting ":" character on method name
|
|
||||||
local name = extract_inner_floating_text(name_node):gsub("^:", "")
|
|
||||||
local type = process_item_header_type(node)
|
|
||||||
|
|
||||||
local item_header = {
|
|
||||||
name = name,
|
|
||||||
type = type,
|
|
||||||
}
|
|
||||||
|
|
||||||
return item_header
|
|
||||||
end
|
|
||||||
|
|
||||||
local function process_item_content_parameters(node)
|
|
||||||
local parameter = {}
|
|
||||||
|
|
||||||
local name = extract_first_node_or_fail(node, "span.parameter")
|
|
||||||
parameter.name = utils.sanitize_string(name:getcontent())
|
|
||||||
|
|
||||||
local nested_parameters_list = extract_first_node(node, "ul")
|
|
||||||
if nested_parameters_list then
|
|
||||||
parameter.type = "table"
|
|
||||||
parameter.nested_parameters = utils.map(
|
|
||||||
nested_parameters_list "* > li",
|
|
||||||
process_item_content_parameters
|
|
||||||
)
|
|
||||||
|
|
||||||
return parameter
|
|
||||||
end
|
|
||||||
|
|
||||||
local types_list = utils.map(node ".type", function(type)
|
|
||||||
return utils.sanitize_string(type:getcontent())
|
|
||||||
end)
|
|
||||||
if #types_list > 0 then
|
|
||||||
parameter.type = types_list
|
|
||||||
end
|
|
||||||
|
|
||||||
return parameter
|
|
||||||
end
|
|
||||||
|
|
||||||
local function process_item_content(content)
|
|
||||||
local parameters_list = extract_item_content_parameter_list(content)
|
|
||||||
local parameters = parameters_list
|
|
||||||
and utils.map(
|
|
||||||
parameters_list "* > li",
|
|
||||||
process_item_content_parameters
|
|
||||||
)
|
|
||||||
or {}
|
|
||||||
|
|
||||||
local item_content = {
|
|
||||||
parameters = #parameters > 0 and parameters or nil,
|
|
||||||
}
|
|
||||||
|
|
||||||
return item_content
|
|
||||||
end
|
|
||||||
|
|
||||||
local function process_section_titles(document)
|
|
||||||
return utils.map(document "h2.section-header", extract_inner_floating_text)
|
|
||||||
end
|
|
||||||
|
|
||||||
local function process_section_items(item)
|
|
||||||
local headers = utils.map(item "dt", process_item_header)
|
|
||||||
local contents = utils.map(item "dd", process_item_content)
|
|
||||||
|
|
||||||
if #headers ~= #contents then
|
|
||||||
log:error {
|
|
||||||
message = "extract_item_content failure: headers and contents don't have the same size",
|
|
||||||
headers = #headers,
|
|
||||||
contents = #contents,
|
|
||||||
}
|
|
||||||
error "extract_item_content"
|
|
||||||
end
|
|
||||||
|
|
||||||
local item_contents = utils.map(headers, function(header, i)
|
|
||||||
return {
|
|
||||||
name = header.name,
|
|
||||||
type = header.type,
|
|
||||||
parameters = contents[i].parameters,
|
|
||||||
}
|
|
||||||
end)
|
|
||||||
|
|
||||||
return item_contents
|
|
||||||
end
|
|
||||||
|
|
||||||
local function process_section_contents(document)
|
|
||||||
local section_items = document "dl.function"
|
|
||||||
|
|
||||||
local section_contents = utils.map(section_items, process_section_items)
|
|
||||||
|
|
||||||
return section_contents
|
|
||||||
end
|
|
||||||
|
|
||||||
function scraper.get_doc_from_page(html)
|
|
||||||
local document = htmlparser.parse(html, 9999)
|
|
||||||
|
|
||||||
local sections_titles = process_section_titles(document)
|
|
||||||
local section_contents = process_section_contents(document)
|
|
||||||
|
|
||||||
if #sections_titles ~= #section_contents then
|
|
||||||
log:error {
|
|
||||||
message = "get_items_from_page failure: section_titles and section_contents don't have the same size",
|
|
||||||
sections_titles = #sections_titles,
|
|
||||||
func = #section_contents,
|
|
||||||
}
|
|
||||||
error "get_items_from_page"
|
|
||||||
end
|
|
||||||
|
|
||||||
local doc = utils.map(sections_titles, function(title, i)
|
|
||||||
return { section = title, items = section_contents[i] }
|
|
||||||
end)
|
|
||||||
|
|
||||||
return doc
|
|
||||||
end
|
|
||||||
|
|
||||||
return scraper
|
|
||||||
|
|
|
@ -0,0 +1,84 @@
|
||||||
|
local Function_Info = require "entities.Function_Info"
|
||||||
|
local Module_Doc = require "entities.Module_Doc"
|
||||||
|
local scraper_utils = require "scraper.utils"
|
||||||
|
local utils = require "utils"
|
||||||
|
|
||||||
|
--- Derive a bare function name from its anchor node.
-- Reads the node's `attr.name` and strips everything up to and including
-- the last ":" in it.
-- @param function_name_node node exposing `attr.name`, or a falsy value
-- @return the stripped name; a falsy argument is returned unchanged
local function extract_function_name(function_name_node)
    if not function_name_node then
        return function_name_node
    end

    -- Assigning to a single local keeps only gsub's first result (the
    -- string) and drops the substitution count.
    local stripped_name = function_name_node.attr.name:gsub(".*:", "")

    return stripped_name
end
|
||||||
|
|
||||||
|
--- Collect the return types listed in a function's return-value node.
-- The node's outer HTML is scraped for `span.types .type` elements and the
-- sanitized inner text of each match is collected.
-- @param function_return_types_node node to scrape, or a falsy value
-- @return whatever `scraper_utils.scrape` returns for the matches, or an
--   empty table when no node is given
local function extract_function_return_types(function_return_types_node)
    if function_return_types_node then
        return scraper_utils.scrape(
            function_return_types_node:outer_html(),
            "span.types .type",
            function(type_node)
                return utils.sanitize_string(type_node:inner_text())
            end
        )
    end

    return {}
end
|
||||||
|
|
||||||
|
--- Build one `Function_Info` per function documented in a section.
-- The `dt a` and `dd ol` nodes of the section are handed over as tuples by
-- `scraper_utils.scrape_tuples`; the former gives the function name, the
-- latter its return types.
-- @param dl HTML of the section's `dl` element
-- @return the result of `scraper_utils.scrape_tuples`
local function extract_section_functions(dl)
    local name_selector = "dt a"
    local return_type_selector = "dd ol"

    -- `nodes` is keyed by the selector each node was matched with.
    local function build_function_info(nodes)
        local info = Function_Info()

        info.name = extract_function_name(nodes[name_selector])
        info.return_types =
            extract_function_return_types(nodes[return_type_selector])

        return info
    end

    return scraper_utils.scrape_tuples(
        dl,
        { name_selector, return_type_selector },
        build_function_info
    )
end
|
||||||
|
|
||||||
|
local module = {}
|
||||||
|
|
||||||
|
--- Build a `Module_Doc` from the HTML of a module documentation page.
-- Every `h2.section-header` is paired, by position, with the `dl.function`
-- found at the same index. The sanitized header text decides which field of
-- the resulting `Module_Doc` receives the functions extracted from the list.
-- Sections that are recognised but not handled yet only print a notice.
-- @param html HTML source of the documentation page
-- @return the populated `Module_Doc`
-- Raises an error when the number of headers and lists differ, or when a
-- section name is not one of the known ones.
function module.get_doc_from_page(html)
    local nodes = scraper_utils.extract_nodes(html, {
        "h2.section-header",
        "dl.function",
    })
    local section_headers = nodes:get "h2.section-header"
    local function_lists = nodes:get "dl.function"

    -- Headers and lists are matched by index, so a count mismatch would
    -- attach functions to the wrong section.
    if #section_headers ~= #function_lists then
        error "The list aren't the same size!"
    end

    local module_doc = Module_Doc()

    for i, h2 in ipairs(section_headers) do
        local section_name = utils.sanitize_string(h2:inner_text())
        local dl_html = function_lists[i]:outer_html()

        if section_name == "Constructors" then
            module_doc.constructors = extract_section_functions(dl_html)
        elseif section_name == "Static module functions" then
            module_doc.static_functions = extract_section_functions(dl_html)
        elseif section_name == "Object properties" then
            print "Not implemented: Object properties"
        elseif section_name == "Deprecated object properties" then
            print "Not implemented: Deprecated object properties"
        elseif section_name == "Object methods" then
            module_doc.methods = extract_section_functions(dl_html)
        elseif section_name == "Signals" then
            print "Not implemented: Signals"
        else
            error("Unknown section name: " .. section_name)
        end
    end

    return module_doc
end
|
||||||
|
|
||||||
|
return module
|
|
@ -0,0 +1,28 @@
|
||||||
|
local Module_Info = require "entities.Module_Info"
|
||||||
|
local scraper_utils = require "scraper.utils"
|
||||||
|
local utils = require "utils"
|
||||||
|
|
||||||
|
local module = {}
|
||||||
|
|
||||||
|
local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
|
||||||
|
|
||||||
|
--- Turn a navigation link node into a `Module_Info`.
-- The module name is the sanitized inner text of the node and the URI is
-- its `href` attribute.
-- @param node link node matched in the index page
-- @return a `Module_Info` built from the name and URI
-- Raises an error, quoting the node's outer HTML, when either the name or
-- the URI is missing.
local function extract_module_info(node)
    local name = utils.sanitize_string(node:inner_text())
    local uri = node.attr.href

    if name and uri then
        return Module_Info(name, uri)
    end

    error("Can't extract module info from node: " .. node:outer_html())
end
|
||||||
|
|
||||||
|
--- List the modules linked from the documentation index page.
-- Every node matching `MODULE_A_TAG_QUERY_SELECTOR` is converted with
-- `extract_module_info`.
-- @param html HTML source of the index page
-- @return the result of `scraper_utils.scrape`
function module.get_modules_from_index(html)
    return scraper_utils.scrape(html, MODULE_A_TAG_QUERY_SELECTOR, extract_module_info)
end
|
||||||
|
|
||||||
|
return module
|
|
@ -0,0 +1,68 @@
|
||||||
|
local List = require "pl.List"
|
||||||
|
local log = require "logger"
|
||||||
|
local Map = require "pl.Map"
|
||||||
|
local scanner = require "web_sanitize.query.scan_html"
|
||||||
|
local tablex = require "pl.tablex"
|
||||||
|
|
||||||
|
local scraper_utils = {}
|
||||||
|
|
||||||
|
--- Extract one value per node matching a selector.
-- The HTML is scanned with `scanner.scan_html`; for each position where the
-- stack matches `query_selector`, `extract_callback` is called with the
-- current node under `pcall`. A failing callback is logged and its node is
-- skipped; a successful result is appended to the returned table.
-- @param html HTML source to scan
-- @param query_selector selector tested with `stack:is`
-- @param extract_callback function receiving the matched node
-- @return table of the values returned by the successful callbacks
function scraper_utils.scrape(html, query_selector, extract_callback)
    local results = {}

    local function visit(stack)
        if not stack:is(query_selector) then
            return
        end

        local ok, extracted = pcall(extract_callback, stack:current())

        if ok then
            table.insert(results, extracted)
        else
            -- On failure `extracted` holds the error raised by the callback.
            log:error { message = extracted }
        end
    end

    scanner.scan_html(html, visit)

    return results
end
|
||||||
|
|
||||||
|
--- Gather the nodes matching each of several selectors in a single scan.
-- @param html HTML source to scan
-- @param query_selectors table of selectors tested with `stack:is`
-- @return a `Map` holding, for every selector, a `List` of the nodes that
--   matched it (empty when nothing matched)
function scraper_utils.extract_nodes(html, query_selectors)
    local matches = Map()

    -- Every selector gets a list up front so a lookup never yields nil.
    for _, query_selector in pairs(query_selectors) do
        matches:set(query_selector, List())
    end

    scanner.scan_html(html, function(stack)
        for _, query_selector in pairs(query_selectors) do
            if stack:is(query_selector) then
                matches:get(query_selector):append(stack:current())
            end
        end
    end)

    return matches
end
|
||||||
|
|
||||||
|
--- Extract one value per tuple of nodes matched by several selectors.
-- Nodes are collected with `scraper_utils.extract_nodes`, then grouped by
-- position: the i-th tuple maps each selector to the i-th node it matched
-- (absent when that selector has fewer matches). The number of tuples is the
-- number of matches of the first selector. `extract_callback` runs under
-- `pcall`; a failure is logged and its tuple is skipped.
-- NOTE(review): pairing is purely positional, so this presumably relies on
-- every selector matching once per item — confirm against the scraped pages.
-- @param html HTML source to scan
-- @param query_selectors table of selectors; the first one sets the tuple count
-- @param extract_callback function receiving a table keyed by selector
-- @return table of the values returned by the successful callbacks
function scraper_utils.scrape_tuples(html, query_selectors, extract_callback)
    local nodes = scraper_utils.extract_nodes(html, query_selectors)
    local tuple_count = #nodes:get(query_selectors[1])
    local results = {}

    for i = 1, tuple_count do
        local tuple = {}
        for _, query_selector in pairs(query_selectors) do
            tuple[query_selector] = nodes:get(query_selector)[i] or nil
        end

        local ok, extracted = pcall(extract_callback, tuple)

        if ok then
            table.insert(results, extracted)
        else
            -- On failure `extracted` holds the error raised by the callback.
            log:error { message = extracted }
        end
    end

    return results
end
|
||||||
|
|
||||||
|
return scraper_utils
|
Loading…
Reference in New Issue