From 9dee667af275417207fc845cc8bdb33507321683 Mon Sep 17 00:00:00 2001 From: Aire-One Date: Thu, 29 Sep 2022 19:13:58 +0200 Subject: [PATCH] feat(scraper): use the new scraper --- src/awesomewm.d.tl/init.lua | 32 ++-- src/awesomewm.d.tl/scraper/init.lua | 217 +--------------------------- 2 files changed, 23 insertions(+), 226 deletions(-) diff --git a/src/awesomewm.d.tl/init.lua b/src/awesomewm.d.tl/init.lua index decd5bb..bcb67ee 100644 --- a/src/awesomewm.d.tl/init.lua +++ b/src/awesomewm.d.tl/init.lua @@ -3,19 +3,19 @@ local inspect = require "inspect" local log = require "logger" local properties = require "properties" local scraper = require "scraper" -local generator = require "generator" +-- local generator = require "generator" log:info( inspect { message = "Start extraction", base_url = properties.base_url } ) --- local index = crawler.fetch(properties.base_url .. properties.index_uri) --- local modules = scraper.get_modules_from_index( --- index, --- properties.ignored_modules --- ) +local index = crawler.fetch(properties.base_url .. properties.index_uri) --- log:info(inspect { modules_found = #modules }) +-- local modules = +-- scraper.get_modules_from_index(index, properties.ignored_modules) +local module_infos = scraper.module_info_list.get_modules_from_index(index) + +log:info(inspect { modules_found = #module_infos }) -- for i = 1, 1 do -- #modules do -- local m = modules[i] @@ -25,12 +25,16 @@ log:info( -- log:info(inspect { items }) -- end -local page = - crawler.fetch(properties.base_url .. "/widgets/wibox.widget.imagebox.html") -local items = scraper.get_doc_from_page(page) +local html = + crawler.fetch(properties.base_url .. "/widgets/wibox.widget.textbox.html") +local module_doc = scraper.module_doc.get_doc_from_page(html) +log:info(inspect { module_doc = module_doc }) + +-- local items = scraper.get_doc_from_page(page) -- log:info(inspect { items }) -generator.write( - generator.generate_teal(items), - properties.out_directory .. "/test.tl" -) + +-- generator.write( +-- generator.generate_teal(items), +-- properties.out_directory .. "/test.tl" +-- ) diff --git a/src/awesomewm.d.tl/scraper/init.lua b/src/awesomewm.d.tl/scraper/init.lua index 6aa1cb2..77155e4 100644 --- a/src/awesomewm.d.tl/scraper/init.lua +++ b/src/awesomewm.d.tl/scraper/init.lua @@ -1,212 +1,5 @@ -local htmlparser = require "htmlparser" -local log = require "logger" -local utils = require "utils" - -local scraper = {} - -function scraper.get_modules_from_index(html, ignored) - local document = htmlparser.parse(html) - - local modules = utils.map(document "#navigation ul > li a", function(node) - return { - name = utils.sanitize_string(node:getcontent()), - uri = node.attributes.href, - } - end) - - local filtered_modules = utils.filter(modules, function(module) - return not utils.has_item(ignored, module.name) - end) - - return filtered_modules -end - -local function extract_first_node_or_fail(node, selector) - local extracted = node(selector)[1] - - if not extracted then - log:error { - message = "Can't find `" .. selector .. "` element!", - node = node:gettext(), - } - error "extract_first_node_or_fail" - end - - return extracted -end - -local function extract_first_node(node, selector) - local extracted = node(selector)[1] - - if not extracted then - return nil - end - - return extracted -end - -local function extract_inner_floating_text(node) - local html = node:getcontent() - - -- Remove inner tags from the html - for _, n in ipairs(node.nodes) do - html = utils.replace(html, n:gettext(), "") - end - - return utils.sanitize_string(html) -end - -local function extract_item_content_parameter_list(content) - for i, n in ipairs(content.nodes) do - -- The parameters