feat(scraper): implement Module_Info scraper
This commit is contained in:
parent
3f8ebe8753
commit
7285f020c0
|
@ -0,0 +1,10 @@
|
|||
local class = require "pl.class"
|
||||
|
||||
--- Record type pairing a module's name with its URI.
-- NOTE(review): Penlight's `class.<Name>()` shorthand presumably also
-- registers the class under that name in the calling environment, in
-- addition to returning it -- confirm against the pl.class docs whether
-- that extra global is wanted here.
local Module_Info = class.Module_Info()

--- Initializer: stores both arguments on the instance unchanged.
-- Presumably invoked by pl.class when `Module_Info(name, uri)` is called.
-- @param name value kept in `self.name`
-- @param uri value kept in `self.uri`
Module_Info._init = function(self, name, uri)
  self.name, self.uri = name, uri
end
|
||||
|
||||
return Module_Info
|
|
@ -0,0 +1,28 @@
|
|||
local Module_Info = require "entities.Module_Info"
|
||||
local scraper_utils = require "scraper.utils"
|
||||
local utils = require "utils"
|
||||
|
||||
-- Table that collects this file's public functions; returned at the end.
local module = {}

-- Selector for the anchors to scrape: every `a` inside an `li` of a `ul`
-- under the `div` whose id is "navigation".
local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
|
||||
|
||||
--- Build a Module_Info from a single scanned node.
-- The name is the node's inner text passed through `utils.sanitize_string`;
-- the URI is the node's `href` attribute.
-- NOTE(review): only nil/false count as "missing" below; an empty string is
-- truthy in Lua, so an empty name or href is accepted -- confirm intended.
-- @param node node object exposing `inner_text()`, `outer_html()` and `attr`
-- @return a new Module_Info(name, uri)
-- @raise an error string containing the node's outer HTML when either the
--   name or the href is missing
local function extract_module_info(node)
  local name = utils.sanitize_string(node:inner_text())
  local uri = node.attr.href

  -- Happy path first: both pieces are present.
  if name and uri then
    return Module_Info(name, uri)
  end

  error("Can't extract module info from node: " .. node:outer_html())
end
|
||||
|
||||
--- Collect module entries from an index page.
-- Delegates to `scraper_utils.scrape`, using MODULE_A_TAG_QUERY_SELECTOR to
-- pick the nodes and `extract_module_info` to convert each one.
-- @param html markup of the index page, forwarded to the scraper as-is
-- @return whatever `scraper_utils.scrape` returns for these arguments
function module.get_modules_from_index(html)
  return scraper_utils.scrape(html, MODULE_A_TAG_QUERY_SELECTOR, extract_module_info)
end
|
||||
|
||||
return module
|
|
@ -0,0 +1,25 @@
|
|||
local log = require "logger"
|
||||
local scanner = require "web_sanitize.query.scan_html"
|
||||
|
||||
-- Table that collects this file's public functions; returned at the end.
local scraper_utils = {}

--- Scan HTML and collect one extracted value per matching node.
-- For every node the scanner reports that matches `query_selector`,
-- `extract_callback` is run under `pcall`. Successful results are appended
-- to the returned list in the order they are encountered; failures are
-- logged through `log:error` and skipped, so a single bad node does not
-- abort the scan.
-- @param html markup handed to `scanner.scan_html`
-- @param query_selector selector tested with `stack:is(...)`
-- @param extract_callback function called with the current node; its first
--   return value is what gets collected
-- @return list (table) of the values produced by `extract_callback`
function scraper_utils.scrape(html, query_selector, extract_callback)
  local results = {}

  local function visit(stack)
    -- Ignore everything that is not a selector match.
    if not stack:is(query_selector) then
      return
    end

    local ok, outcome = pcall(extract_callback, stack:current())

    if ok then
      results[#results + 1] = outcome
    else
      -- On failure `outcome` carries the error raised by the callback.
      log:error { message = outcome }
    end
  end

  scanner.scan_html(html, visit)

  return results
end
|
||||
|
||||
return scraper_utils
|
Loading…
Reference in New Issue