From 7285f020c0df280da8fa7d8658f987fa9be2e051 Mon Sep 17 00:00:00 2001
From: Aire-One
Date: Wed, 21 Sep 2022 23:41:27 +0200
Subject: [PATCH] feat(scraper): implement Module_Info scraper

---
 src/awesomewm.d.tl/entities/Module_Info.lua   | 16 ++++++++
 .../scraper/module_info_list.lua              | 39 +++++++++++++++++++
 src/awesomewm.d.tl/scraper/utils.lua          | 32 +++++++++++++++
 3 files changed, 87 insertions(+)
 create mode 100644 src/awesomewm.d.tl/entities/Module_Info.lua
 create mode 100644 src/awesomewm.d.tl/scraper/module_info_list.lua
 create mode 100644 src/awesomewm.d.tl/scraper/utils.lua

diff --git a/src/awesomewm.d.tl/entities/Module_Info.lua b/src/awesomewm.d.tl/entities/Module_Info.lua
new file mode 100644
index 0000000..df163d9
--- /dev/null
+++ b/src/awesomewm.d.tl/entities/Module_Info.lua
@@ -0,0 +1,16 @@
+local class = require "pl.class"
+
+--- Record pairing a module name with its URI.
+-- NOTE(review): Penlight's `class.Name()` form presumably also registers the
+-- class under that name in the caller's environment; confirm this is wanted.
+local Module_Info = class.Module_Info()
+
+--- Initialise a Module_Info instance.
+-- @param name value stored in the `name` field
+-- @param uri value stored in the `uri` field
+function Module_Info:_init(name, uri)
+   self.name = name
+   self.uri = uri
+end
+
+return Module_Info
diff --git a/src/awesomewm.d.tl/scraper/module_info_list.lua b/src/awesomewm.d.tl/scraper/module_info_list.lua
new file mode 100644
index 0000000..f2a8b5f
--- /dev/null
+++ b/src/awesomewm.d.tl/scraper/module_info_list.lua
@@ -0,0 +1,39 @@
+local Module_Info = require "entities.Module_Info"
+local scraper_utils = require "scraper.utils"
+local utils = require "utils"
+
+local module = {}
+
+-- Selector matching the anchors that are turned into Module_Info entries.
+local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
+
+--- Build a Module_Info from one matched node.
+-- The name is the node's sanitized inner text, the uri its `href` attribute.
+-- Raises an error quoting the node's outer HTML when either one is missing.
+-- @param node node handed over by the scraper
+-- @return a Module_Info instance
+local function extract_module_info(node)
+   local name = utils.sanitize_string(node:inner_text())
+   -- Guard the lookup: a node without an attribute table would otherwise
+   -- fail with a bare "attempt to index" error instead of the message below.
+   local uri = node.attr and node.attr.href
+
+   if not (name and uri) then
+      error("Can't extract module info from node: " .. node:outer_html())
+   end
+
+   return Module_Info(name, uri)
+end
+
+--- Collect a Module_Info for every node of `html` matching the selector.
+-- @param html HTML text to scan
+-- @return list of Module_Info; nodes that fail extraction are left out
+function module.get_modules_from_index(html)
+   return scraper_utils.scrape(
+      html,
+      MODULE_A_TAG_QUERY_SELECTOR,
+      extract_module_info
+   )
+end
+
+return module
diff --git a/src/awesomewm.d.tl/scraper/utils.lua b/src/awesomewm.d.tl/scraper/utils.lua
new file mode 100644
index 0000000..04280ef
--- /dev/null
+++ b/src/awesomewm.d.tl/scraper/utils.lua
@@ -0,0 +1,32 @@
+local log = require "logger"
+local scanner = require "web_sanitize.query.scan_html"
+
+local scraper_utils = {}
+
+--- Scan `html` and gather what `extract_callback` returns for matching nodes.
+-- A callback failure is logged and its node contributes nothing to the result.
+-- @param html HTML text to scan
+-- @param query_selector selector tested against each visited node stack
+-- @param extract_callback function called with the matched node
+-- @return list of the values returned by `extract_callback`
+function scraper_utils.scrape(html, query_selector, extract_callback)
+   local ret = {}
+
+   scanner.scan_html(html, function(stack)
+      if stack:is(query_selector) then
+         local node = stack:current()
+         -- pcall traps callback errors so they are logged, not propagated.
+         local success, info = pcall(extract_callback, node)
+
+         if not success then
+            log:error { message = info }
+         else
+            table.insert(ret, info)
+         end
+      end
+   end)
+
+   return ret
+end
+
+return scraper_utils