Remove htmlparser dependence (#2) #8

Merged
Aire-One merged 4 commits from feat/#2 into master 2022-09-29 20:14:17 +02:00
3 changed files with 63 additions and 0 deletions
Showing only changes of commit 7285f020c0 - Show all commits

View File

@ -0,0 +1,10 @@
local class = require "pl.class"
--- Entity holding the two facts known about a module: its name and its URI.
-- Built with Penlight's `pl.class`; instances are created by calling the
-- class itself, e.g. `Module_Info(name, uri)`.
local Module_Info = class.Module_Info()

--- Initialiser that `pl.class` runs for each new instance.
-- @param name value stored in the `name` field
-- @param uri value stored in the `uri` field
Module_Info._init = function(self, name, uri)
  self.uri = uri
  self.name = name
end
return Module_Info

View File

@ -0,0 +1,28 @@
local Module_Info = require "entities.Module_Info"
local scraper_utils = require "scraper.utils"
local utils = require "utils"
-- Table returned at the end of this file; functions attached to it are the
-- public interface.
local module = {}
-- Selector passed to `scraper_utils.scrape`. Read as a CSS selector it matches
-- every `a` element under an `li` of a `ul` inside the `div` with id
-- `navigation`.
-- NOTE(review): presumably that is the module list of the documentation index
-- page — confirm against the HTML actually being scraped.
local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
--- Turn one matched node into a `Module_Info`.
-- The name is the node's inner text run through `utils.sanitize_string`; the
-- URI is read from the node's `href` attribute.
-- @param node node object supplied by the scraper; it must offer
--   `inner_text()`, `outer_html()` and an `attr` table
-- @return a `Module_Info` built from the extracted name and URI
-- Raises an error, quoting the node's outer HTML, when the name or the URI
-- is missing.
local function extract_module_info(node)
  local name = utils.sanitize_string(node:inner_text())
  local uri = node.attr.href

  -- Happy path first: both pieces are present.
  if name and uri then
    return Module_Info(name, uri)
  end

  error("Can't extract module info from node: " .. node:outer_html())
end
--- Collect the modules referenced by an index page.
-- Delegates to `scraper_utils.scrape`, selecting nodes with
-- `MODULE_A_TAG_QUERY_SELECTOR` and converting each of them with
-- `extract_module_info`.
-- @param html HTML source handed to the scraper
-- @return whatever `scraper_utils.scrape` returns for these arguments
module.get_modules_from_index = function(html)
  local selector = MODULE_A_TAG_QUERY_SELECTOR
  local extractor = extract_module_info
  return scraper_utils.scrape(html, selector, extractor)
end
return module

View File

@ -0,0 +1,25 @@
local log = require "logger"
local scanner = require "web_sanitize.query.scan_html"
--- Table of scraping helpers returned by this file.
local scraper_utils = {}

--- Scan an HTML document and gather one value per node matching a selector.
-- `scanner.scan_html` drives the traversal. For every position where the
-- node stack matches `query_selector`, `extract_callback` is called in
-- protected mode with the current node. Successful results are appended to
-- the returned list; failures are reported through `log:error` and skipped,
-- so one bad node does not abort the scan.
-- @param html HTML source to scan
-- @param query_selector selector tested against the node stack
-- @param extract_callback function receiving the matched node and returning
--   the value to collect
-- @return list of the values produced by the successful callback calls
function scraper_utils.scrape(html, query_selector, extract_callback)
  local results = {}

  local function visit(stack)
    -- Nothing to do for positions the selector does not match.
    if not stack:is(query_selector) then
      return
    end

    local node = stack:current()
    local ok, outcome = pcall(extract_callback, node)
    if ok then
      results[#results + 1] = outcome
    else
      -- On failure `outcome` carries the error value raised by the callback.
      log:error { message = outcome }
    end
  end

  scanner.scan_html(html, visit)
  return results
end
return scraper_utils