scraper
commit 5b32f807b1
parent 3056d48d5a

@@ -1,4 +1,6 @@
 local curl = require "cURL"
+local inspect = require "inspect"
+local log = require "logger"
 
 local crawler = {}
 
@@ -17,7 +19,7 @@ function crawler.request(url)
   local code, body = easy:getinfo_response_code(), table.concat(queue)
   easy:close()
 
-  if code ~= 200 then
+  if code < 200 or code >= 300 then
     error {
       message = "curl response code is not 200",
       code = code,

@@ -28,4 +30,17 @@ function crawler.request(url)
   return queue
 end
 
+function crawler.fetch(url)
+  local success, result = pcall(crawler.request, url)
+
+  if not success then
+    log:error(inspect { "Fetch failed", status = success, error = result })
+    return
+  end
+
+  log:info(inspect { message = "Successfully fetched resource", url = url })
+
+  return table.concat(result, "")
+end
+
 return crawler
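
Note: crawler.fetch wraps crawler.request in pcall, logs the outcome, and returns the response body as a single string (or nil when the request raised an error). A minimal usage sketch, assuming the module is loaded on its own; the URL is only an illustration, not part of this change:

local crawler = require "crawler"

local body = crawler.fetch "https://example.org/index.html"
if body then
  print(#body) -- size of the fetched document
end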

@@ -1,19 +1,31 @@
+local crawler = require "crawler"
 local inspect = require "inspect"
 local log = require "logger"
 local properties = require "properties"
-local utils = require "utils"
+local scraper = require "scraper"
 
 log:info(
   inspect { message = "Start extraction", base_url = properties.base_url }
 )
 
-local index = utils.fetch(properties.base_url .. properties.index_uri)
-local modules = utils.get_modules_from_index(index, properties.ignored_modules)
+-- local index = crawler.fetch(properties.base_url .. properties.index_uri)
+-- local modules = scraper.get_modules_from_index(
+--   index,
+--   properties.ignored_modules
+-- )
 
-log:info(inspect { modules_found = #modules })
+-- log:info(inspect { modules_found = #modules })
 
-local m = modules[1]
-log:info(inspect { try = m })
-local page = utils.fetch(properties.base_url .. "/" .. m.uri)
-local items = utils.get_items_from_page(page)
+-- for i = 1, 1 do -- #modules do
+--   local m = modules[i]
+--   log:info(inspect { try = m })
+--   local page = crawler.fetch(properties.base_url .. "/" .. m.uri)
+--   local items = scraper.get_doc_from_page(page)
+--   log:info(inspect { items })
+-- end
+
+local page = crawler.fetch(
+  properties.base_url .. "/widgets/awful.widget.button.html"
+)
+local items = scraper.get_doc_from_page(page)
 log:info(inspect { items })

@@ -1,6 +1,7 @@
 local properties = {}
 
-properties.base_url = "https://awesomewm.org/apidoc"
+-- properties.base_url = "https://awesomewm.org/apidoc"
+properties.base_url = "file:///usr/share/doc/awesome/doc"
 
 properties.index_uri = "/index.html"
 

@@ -1,18 +1,209 @@
+local htmlparser = require "htmlparser"
+local log = require "logger"
+local utils = require "utils"
+
 local scraper = {}
 
-function scraper.extract_nodes(document, selector, extractor)
-  local nodes = document(selector)
-  local extracts = {}
-
-  for _, node in ipairs(nodes) do
-    local data = extractor(node)
-
-    if data then
-      table.insert(extracts, data)
-    end
-  end
-
-  return extracts
-end
+function scraper.get_modules_from_index(html, ignored)
+  local document = htmlparser.parse(html)
+
+  local modules = utils.map(document "#navigation ul > li a", function(node)
+    return {
+      name = utils.sanitize_string(node:getcontent()),
+      uri = node.attributes.href,
+    }
+  end)
+
+  local filtered_modules = utils.filter(modules, function(module)
+    return not utils.has_item(ignored, module.name)
+  end)
+
+  return filtered_modules
+end
+
+local function extract_first_node_or_fail(node, selector)
+  local extracted = node(selector)[1]
+
+  if not extracted then
+    log:error {
+      message = "Can't find `" .. selector .. "` element!",
+      node = node:gettext(),
+    }
+    error "extract_first_node_or_fail"
+  end
+
+  return extracted
+end
+
+local function extract_first_node(node, selector)
+  local extracted = node(selector)[1]
+
+  if not extracted then
+    return nil
+  end
+
+  return extracted
+end
+
+local function extract_inner_floating_text(node)
+  local html = node:getcontent()
+
+  -- Remove inner tags from the html
+  for _, n in ipairs(node.nodes) do
+    html = utils.replace(html, n:gettext(), "")
+  end
+
+  return utils.sanitize_string(html)
+end
+
+local function extract_item_content_parameter_list(content)
+  for i, n in ipairs(content.nodes) do
+    -- The parameters <ul> is the next element after one of these <h3>
+    if
+      n.name == "h3"
+      and utils.has_item(
+        { "Parameters:", "Type constraints:", "Arguments" },
+        utils.sanitize_string(n:getcontent())
+      )
+    then
+      return content.nodes[i + 1]
+    end
+  end
+
+  return nil
+end
+
+local function process_item_header_type(node)
+  local types_node = extract_first_node(node, "span.summary_type")
+
+  if not types_node then
+    return nil
+  end
+
+  local types_string = string.match(types_node:getcontent(), "%((.-)%)")
+    or types_node:getcontent()
+  types_string = utils.replace(types_string, "or", ",")
+  local splitted = string.gmatch(types_string, "([^,]+)")
+
+  local types = {}
+  for t in splitted do
+    table.insert(types, utils.sanitize_string(t))
+  end
+
+  return types
+end
+
+local function process_item_header(node)
+  local name = extract_first_node_or_fail(node, "strong")
+  local type = process_item_header_type(node)
+
+  local item_header = {
+    name = extract_inner_floating_text(name),
+    type = type,
+  }
+
+  return item_header
+end
+
+local function process_item_content_parameters(node)
+  local parameter = {}
+
+  local name = extract_first_node_or_fail(node, "span.parameter")
+  parameter.name = utils.sanitize_string(name:getcontent())
+
+  local nested_parameters_list = extract_first_node(node, "ul")
+  if nested_parameters_list then
+    parameter.type = "table"
+    parameter.nested_parameters = utils.map(
+      nested_parameters_list "* > li",
+      process_item_content_parameters
+    )
+
+    return parameter
+  end
+
+  local types_list = utils.map(node ".type", function(type)
+    return utils.sanitize_string(type:getcontent())
+  end)
+  if #types_list > 0 then
+    parameter.type = types_list
+  end
+
+  return parameter
+end
+
+local function process_item_content(content)
+  local parameters_list = extract_item_content_parameter_list(content)
+  local parameters = parameters_list
+      and utils.map(
+        parameters_list "* > li",
+        process_item_content_parameters
+      )
+    or {}
+
+  local item_content = {
+    parameters = #parameters > 0 and parameters or nil,
+  }
+
+  return item_content
+end
+
+local function process_section_titles(document)
+  return utils.map(document "h2.section-header", extract_inner_floating_text)
+end
+
+local function process_section_items(item)
+  local headers = utils.map(item "dt", process_item_header)
+  local contents = utils.map(item "dd", process_item_content)
+
+  if #headers ~= #contents then
+    log:error {
+      message = "extract_item_content failure: headers and contents don't have the same size",
+      headers = #headers,
+      contents = #contents,
+    }
+    error "extract_item_content"
+  end
+
+  local item_contents = utils.map(headers, function(header, i)
+    return {
+      name = header.name,
+      type = header.type,
+      parameters = contents[i].parameters,
+    }
+  end)
+
+  return item_contents
+end
+
+local function process_section_contents(document)
+  local section_items = document "dl.function"
+
+  local section_contents = utils.map(section_items, process_section_items)
+
+  return section_contents
+end
+
+function scraper.get_doc_from_page(html)
+  local document = htmlparser.parse(html, 9999)
+
+  local sections_titles = process_section_titles(document)
+  local section_contents = process_section_contents(document)
+
+  if #sections_titles ~= #section_contents then
+    log:error {
+      message = "get_items_from_page failure: section_titles and section_contents don't have the same size",
+      sections_titles = #sections_titles,
+      func = #section_contents,
+    }
+    error "get_items_from_page"
+  end
+
+  local doc = utils.map(sections_titles, function(title, i)
+    return { sections = title, items = section_contents[i] }
+  end)
+
+  return doc
+end
 
 return scraper
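
Note: scraper.get_doc_from_page pairs each "h2.section-header" title with the "dl.function" block at the same index, so the returned table is a list of { sections = <title>, items = ... } entries whose items carry name, type, and parameters. A hedged sketch of the expected shape; the concrete values are illustrative only, not taken from this diff:

local crawler = require "crawler"
local scraper = require "scraper"

local html = crawler.fetch "file:///usr/share/doc/awesome/doc/widgets/awful.widget.button.html"
local doc = scraper.get_doc_from_page(html)
-- doc might look like:
-- {
--   {
--     sections = "Constructors",
--     items = {
--       { name = "awful.widget.button", type = { "widget" }, parameters = { { name = "args", type = "table" } } },
--     },
--   },
-- }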

@@ -1,8 +1,3 @@
-local crawler = require "crawler"
-local htmlparser = require "htmlparser"
-local inspect = require "inspect"
-local log = require "logger"
-local scraper = require "scraper"
 local web_sanitize = require "web_sanitize"
 
 local utils = {}

@@ -17,70 +12,47 @@ function utils.has_item(table, item)
   return nil
 end
 
-function utils.sanitize_page_name(string)
-  return (web_sanitize.extract_text(string):gsub("^%s*(.-)%s*$", "%1"))
-end
-
-function utils.fetch(url)
-  local success, result = pcall(crawler.request, url)
-
-  if not success then
-    log:error(inspect { "fetch failed", status = success, error = result })
-    return
-  end
-
-  log:info(inspect { message = "successfully fetched resource", url = url })
-
-  return table.concat(result, "")
-end
-
-function utils.get_modules_from_index(html, ignored)
-  local document = htmlparser.parse(html)
-
-  local modules = scraper.extract_nodes(
-    document,
-    "#navigation ul > li a",
-    function(node)
-      if node.name ~= "a" then
-        return nil
-      end
-
-      local name = utils.sanitize_page_name(node:getcontent())
-
-      if utils.has_item(ignored, name) then
-        return nil
-      end
-
-      local module = {
-        name = name,
-        uri = node.attributes.href,
-      }
-
-      return module
-    end
-  )
-
-  return modules
-end
-
-function utils.get_items_from_page(html)
-  local document = htmlparser.parse(html, 9999)
-
-  local titles = scraper.extract_nodes(
-    document,
-    "h2.section-header",
-    function(node)
-      return {
-        name = node.name,
-      }
-    end
-  )
-
-  local items = scraper.extract_nodes(document, "dl.function", function(node)
-    return { name = node.name }
-  end)
-
-  return { titles, items }
-end
+function utils.filter(list, predicate)
+  local filtered = {}
+
+  for position, value in ipairs(list) do
+    if predicate(value, position) then
+      table.insert(filtered, value)
+    end
+  end
+
+  return filtered
+end
+
+function utils.map(list, iteratee)
+  local mapped = {}
+
+  for position, value in ipairs(list) do
+    table.insert(mapped, iteratee(value, position))
+  end
+
+  return mapped
+end
+
+function utils.sanitize_string(string)
+  return utils.trim(
+    utils.replace(web_sanitize.extract_text(string), "^%s*(.-)%s*$", "%1")
+  )
+end
+
+-- Extracted from the Penlight Lua library.
+-- Sometimes Lua string.gsub can't match unescaped strings.
+-- https://stackoverflow.com/a/72666170
+function utils.escape(string)
+  return (string:gsub("[%-%.%+%[%]%(%)%$%^%%%?%*]", "%%%1"))
+end
+
+function utils.replace(string, old, new, n)
+  return (string:gsub(utils.escape(old), new:gsub("%%", "%%%%"), n))
+end
+
+function utils.trim(string)
+  return string:match "^%s*(.-)%s*$"
+end
 
 return utils
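
Note: utils.map and utils.filter both pass (value, position) to their callbacks, and utils.replace escapes Lua pattern magic characters in the search string before calling gsub, so it behaves as a plain-text replacement. A small sketch, assuming the helpers are used standalone:

local utils = require "utils"

local doubled = utils.map({ 1, 2, 3 }, function(v) return v * 2 end) -- { 2, 4, 6 }
local evens = utils.filter({ 1, 2, 3, 4 }, function(v) return v % 2 == 0 end) -- { 2, 4 }
local dashed = utils.replace("a.b.c", ".", "-") -- "a-b-c"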