feat(scraper): use the new scraper
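
Split the scraper into scraper.module_doc, scraper.module_info_list and
scraper.utils submodules. The entry point now lists modules through
scraper.module_info_list.get_modules_from_index and scrapes a single page
through scraper.module_doc.get_doc_from_page; the generator calls are
commented out for now.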
parent ff446d3bea
commit 9dee667af2
@@ -3,19 +3,19 @@ local inspect = require "inspect"
 local log = require "logger"
 local properties = require "properties"
 local scraper = require "scraper"
-local generator = require "generator"
+-- local generator = require "generator"
 
 log:info(
     inspect { message = "Start extraction", base_url = properties.base_url }
 )
 
--- local index = crawler.fetch(properties.base_url .. properties.index_uri)
--- local modules = scraper.get_modules_from_index(
---    index,
---    properties.ignored_modules
--- )
+local index = crawler.fetch(properties.base_url .. properties.index_uri)
 
--- log:info(inspect { modules_found = #modules })
+-- local modules =
+--    scraper.get_modules_from_index(index, properties.ignored_modules)
+local module_infos = scraper.module_info_list.get_modules_from_index(index)
+
+log:info(inspect { modules_found = #module_infos })
 
 -- for i = 1, 1 do -- #modules do
 --    local m = modules[i]
@@ -25,12 +25,16 @@ log:info(
 -- log:info(inspect { items })
 -- end
 
-local page =
-    crawler.fetch(properties.base_url .. "/widgets/wibox.widget.imagebox.html")
-local items = scraper.get_doc_from_page(page)
+local html =
+    crawler.fetch(properties.base_url .. "/widgets/wibox.widget.textbox.html")
+local module_doc = scraper.module_doc.get_doc_from_page(html)
+log:info(inspect { module_doc = module_doc })
 
-generator.write(
-    generator.generate_teal(items),
-    properties.out_directory .. "/test.tl"
-)
+-- local items = scraper.get_doc_from_page(page)
+-- log:info(inspect { items })
+
+-- generator.write(
+--    generator.generate_teal(items),
+--    properties.out_directory .. "/test.tl"
+-- )
 
@@ -1,212 +1,5 @@
-local htmlparser = require "htmlparser"
-local log = require "logger"
-local utils = require "utils"
-
-local scraper = {}
-
-function scraper.get_modules_from_index(html, ignored)
-    local document = htmlparser.parse(html)
-
-    local modules = utils.map(document "#navigation ul > li a", function(node)
-        return {
-            name = utils.sanitize_string(node:getcontent()),
-            uri = node.attributes.href,
-        }
-    end)
-
-    local filtered_modules = utils.filter(modules, function(module)
-        return not utils.has_item(ignored, module.name)
-    end)
-
-    return filtered_modules
-end
-
-local function extract_first_node_or_fail(node, selector)
-    local extracted = node(selector)[1]
-
-    if not extracted then
-        log:error {
-            message = "Can't find `" .. selector .. "` element!",
-            node = node:gettext(),
-        }
-        error "extract_first_node_or_fail"
-    end
-
-    return extracted
-end
-
-local function extract_first_node(node, selector)
-    local extracted = node(selector)[1]
-
-    if not extracted then
-        return nil
-    end
-
-    return extracted
-end
-
-local function extract_inner_floating_text(node)
-    local html = node:getcontent()
-
-    -- Remove inner tags from the html
-    for _, n in ipairs(node.nodes) do
-        html = utils.replace(html, n:gettext(), "")
-    end
-
-    return utils.sanitize_string(html)
-end
-
-local function extract_item_content_parameter_list(content)
-    for i, n in ipairs(content.nodes) do
-        -- The parameters <ul> is the next element after one of these <h3>
-        if
-            n.name == "h3"
-            and utils.has_item(
-                { "Parameters:", "Type constraints:", "Arguments" },
-                utils.sanitize_string(n:getcontent())
-            )
-        then
-            return content.nodes[i + 1]
-        end
-    end
-
-    return nil
-end
-
-local function process_item_header_type(node)
-    local types_node = extract_first_node(node, "span.summary_type")
-
-    if not types_node then
-        return nil
-    end
-
-    local types_string = string.match(types_node:getcontent(), "%((.-)%)")
-        or types_node:getcontent()
-    types_string = utils.replace(types_string, "or", ",")
-    local splitted = string.gmatch(types_string, "([^,]+)")
-
-    local types = {}
-    for t in splitted do
-        table.insert(types, utils.sanitize_string(t))
-    end
-
-    return types
-end
-
-local function process_item_header(node)
-    local name_node = extract_first_node_or_fail(node, "strong")
-
-    -- Remove starting ":" character on method name
-    local name = extract_inner_floating_text(name_node):gsub("^:", "")
-    local type = process_item_header_type(node)
-
-    local item_header = {
-        name = name,
-        type = type,
-    }
-
-    return item_header
-end
-
-local function process_item_content_parameters(node)
-    local parameter = {}
-
-    local name = extract_first_node_or_fail(node, "span.parameter")
-    parameter.name = utils.sanitize_string(name:getcontent())
-
-    local nested_parameters_list = extract_first_node(node, "ul")
-    if nested_parameters_list then
-        parameter.type = "table"
-        parameter.nested_parameters = utils.map(
-            nested_parameters_list "* > li",
-            process_item_content_parameters
-        )
-
-        return parameter
-    end
-
-    local types_list = utils.map(node ".type", function(type)
-        return utils.sanitize_string(type:getcontent())
-    end)
-    if #types_list > 0 then
-        parameter.type = types_list
-    end
-
-    return parameter
-end
-
-local function process_item_content(content)
-    local parameters_list = extract_item_content_parameter_list(content)
-    local parameters = parameters_list
-            and utils.map(
-                parameters_list "* > li",
-                process_item_content_parameters
-            )
-        or {}
-
-    local item_content = {
-        parameters = #parameters > 0 and parameters or nil,
-    }
-
-    return item_content
-end
-
-local function process_section_titles(document)
-    return utils.map(document "h2.section-header", extract_inner_floating_text)
-end
-
-local function process_section_items(item)
-    local headers = utils.map(item "dt", process_item_header)
-    local contents = utils.map(item "dd", process_item_content)
-
-    if #headers ~= #contents then
-        log:error {
-            message = "extract_item_content failure: headers and contents don't have the same size",
-            headers = #headers,
-            contents = #contents,
-        }
-        error "extract_item_content"
-    end
-
-    local item_contents = utils.map(headers, function(header, i)
-        return {
-            name = header.name,
-            type = header.type,
-            parameters = contents[i].parameters,
-        }
-    end)
-
-    return item_contents
-end
-
-local function process_section_contents(document)
-    local section_items = document "dl.function"
-
-    local section_contents = utils.map(section_items, process_section_items)
-
-    return section_contents
-end
-
-function scraper.get_doc_from_page(html)
-    local document = htmlparser.parse(html, 9999)
-
-    local sections_titles = process_section_titles(document)
-    local section_contents = process_section_contents(document)
-
-    if #sections_titles ~= #section_contents then
-        log:error {
-            message = "get_items_from_page failure: section_titles and section_contents don't have the same size",
-            sections_titles = #sections_titles,
-            func = #section_contents,
-        }
-        error "get_items_from_page"
-    end
-
-    local doc = utils.map(sections_titles, function(title, i)
-        return { section = title, items = section_contents[i] }
-    end)
-
-    return doc
-end
-
-return scraper
+return {
+    module_doc = require "scraper.module_doc",
+    module_info_list = require "scraper.module_info_list",
+    utils = require "scraper.utils",
+}
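
For context, a minimal sketch of how the two new entry points compose after this change. Only the calls visible in the diff above come from the repository; the `require "crawler"` line, the loop, and the `uri` field on each module record are assumptions (the `uri` field is carried over from what the old scraper.get_modules_from_index returned).

local crawler = require "crawler" -- assumed: fetch() is used by the entry point
local inspect = require "inspect"
local log = require "logger"
local properties = require "properties"
local scraper = require "scraper"

-- List every module documented on the index page.
local index = crawler.fetch(properties.base_url .. properties.index_uri)
local module_infos = scraper.module_info_list.get_modules_from_index(index)
log:info(inspect { modules_found = #module_infos })

-- Illustrative only: scrape each listed module, assuming every entry
-- carries a `uri` field as the old scraper's module records did.
for _, info in ipairs(module_infos) do
    local html = crawler.fetch(properties.base_url .. info.uri)
    local module_doc = scraper.module_doc.get_doc_from_page(html)
    log:info(inspect { module_doc = module_doc })
end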