diff --git a/rockspecs/awesomewm.d.tl-dev-1.rockspec b/rockspecs/awesomewm.d.tl-dev-1.rockspec
index d91ae98..cc2af55 100644
--- a/rockspecs/awesomewm.d.tl-dev-1.rockspec
+++ b/rockspecs/awesomewm.d.tl-dev-1.rockspec
@@ -12,7 +12,6 @@ dependencies = {
"lualogging 1.6.0",
"inspect 3.1.3",
"ansicolors 1.0.2",
- "htmlparser 0.3.9",
"web_sanitize 1.3.0",
"penlight 1.13.1",
"luasocket 3.1.0-1",
diff --git a/src/awesomewm.d.tl/entities/Function_Info.lua b/src/awesomewm.d.tl/entities/Function_Info.lua
new file mode 100644
index 0000000..956ac11
--- /dev/null
+++ b/src/awesomewm.d.tl/entities/Function_Info.lua
@@ -0,0 +1,24 @@
+local class = require "pl.class"
+local List = require "pl.List"
+
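+-- A Function_Info describes a single documented function: its name, parameters and return types.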
+local Function_Info = class.Function_Info()
+
+function Function_Info:_init()
+ self.name = ""
+ self.parameters = List()
+ self.return_types = List()
+end
+
+function Function_Info:append_return_type(return_type)
+ self.return_types:append(return_type)
+end
+
+function Function_Info:append_parameter(name, type)
+ self.parameters:append {
+ name = name,
+ type = type,
+ }
+end
+
+return Function_Info
diff --git a/src/awesomewm.d.tl/entities/Module_Doc.lua b/src/awesomewm.d.tl/entities/Module_Doc.lua
new file mode 100644
index 0000000..48ddc31
--- /dev/null
+++ b/src/awesomewm.d.tl/entities/Module_Doc.lua
@@ -0,0 +1,13 @@
+local class = require "pl.class"
+local List = require "pl.List"
+
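+-- A Module_Doc holds the functions scraped from one module page, grouped by section.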
+local Module_Doc = class.Module_Doc()
+
+function Module_Doc:_init()
+ self.constructors = List()
+ self.methods = List()
+ self.static_functions = List()
+end
+
+return Module_Doc
diff --git a/src/awesomewm.d.tl/entities/Module_Info.lua b/src/awesomewm.d.tl/entities/Module_Info.lua
new file mode 100644
index 0000000..df163d9
--- /dev/null
+++ b/src/awesomewm.d.tl/entities/Module_Info.lua
@@ -0,0 +1,11 @@
+local class = require "pl.class"
+
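+-- A Module_Info pairs a module's name with the URI of its documentation page.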
+local Module_Info = class.Module_Info()
+
+function Module_Info:_init(name, uri)
+ self.name = name
+ self.uri = uri
+end
+
+return Module_Info
diff --git a/src/awesomewm.d.tl/init.lua b/src/awesomewm.d.tl/init.lua
index decd5bb..686c1ee 100644
--- a/src/awesomewm.d.tl/init.lua
+++ b/src/awesomewm.d.tl/init.lua
@@ -3,19 +3,19 @@ local inspect = require "inspect"
local log = require "logger"
local properties = require "properties"
local scraper = require "scraper"
-local generator = require "generator"
+-- local generator = require "generator"
log:info(
inspect { message = "Start extraction", base_url = properties.base_url }
)
--- local index = crawler.fetch(properties.base_url .. properties.index_uri)
--- local modules = scraper.get_modules_from_index(
--- index,
--- properties.ignored_modules
--- )
+local index = crawler.fetch(properties.base_url .. properties.index_uri)
--- log:info(inspect { modules_found = #modules })
+-- local modules =
+-- scraper.get_modules_from_index(index, properties.ignored_modules)
+local module_infos = scraper.module_info_list.get_modules_from_index(index)
+
+log:info(inspect { modules_found = #module_infos })
-- for i = 1, 1 do -- #modules do
-- local m = modules[i]
@@ -25,12 +25,15 @@ log:info(
-- log:info(inspect { items })
-- end
-local page =
- crawler.fetch(properties.base_url .. "/widgets/wibox.widget.imagebox.html")
-local items = scraper.get_doc_from_page(page)
+local html =
+ crawler.fetch(properties.base_url .. "/widgets/wibox.widget.textbox.html")
+local module_doc = scraper.module_doc.get_doc_from_page(html)
+log:info(inspect { module_doc = module_doc })
+
+-- local items = scraper.get_doc_from_page(page)
-- log:info(inspect { items })
-generator.write(
- generator.generate_teal(items),
- properties.out_directory .. "/test.tl"
-)
+-- generator.write(
+-- generator.generate_teal(items),
+-- properties.out_directory .. "/test.tl"
+-- )
diff --git a/src/awesomewm.d.tl/scraper/init.lua b/src/awesomewm.d.tl/scraper/init.lua
index 6aa1cb2..77155e4 100644
--- a/src/awesomewm.d.tl/scraper/init.lua
+++ b/src/awesomewm.d.tl/scraper/init.lua
@@ -1,212 +1,5 @@
-local htmlparser = require "htmlparser"
-local log = require "logger"
-local utils = require "utils"
-
-local scraper = {}
-
-function scraper.get_modules_from_index(html, ignored)
- local document = htmlparser.parse(html)
-
- local modules = utils.map(document "#navigation ul > li a", function(node)
- return {
- name = utils.sanitize_string(node:getcontent()),
- uri = node.attributes.href,
- }
- end)
-
- local filtered_modules = utils.filter(modules, function(module)
- return not utils.has_item(ignored, module.name)
- end)
-
- return filtered_modules
-end
-
-local function extract_first_node_or_fail(node, selector)
- local extracted = node(selector)[1]
-
- if not extracted then
- log:error {
- message = "Can't find `" .. selector .. "` element!",
- node = node:gettext(),
- }
- error "extract_first_node_or_fail"
- end
-
- return extracted
-end
-
-local function extract_first_node(node, selector)
- local extracted = node(selector)[1]
-
- if not extracted then
- return nil
- end
-
- return extracted
-end
-
-local function extract_inner_floating_text(node)
- local html = node:getcontent()
-
- -- Remove inner tags from the html
- for _, n in ipairs(node.nodes) do
- html = utils.replace(html, n:gettext(), "")
- end
-
- return utils.sanitize_string(html)
-end
-
-local function extract_item_content_parameter_list(content)
- for i, n in ipairs(content.nodes) do
- -- The parameters list is the next element after one of these
- if
- n.name == "h3"
- and utils.has_item(
- { "Parameters:", "Type constraints:", "Arguments" },
- utils.sanitize_string(n:getcontent())
- )
- then
- return content.nodes[i + 1]
- end
- end
-
- return nil
-end
-
-local function process_item_header_type(node)
- local types_node = extract_first_node(node, "span.summary_type")
-
- if not types_node then
- return nil
- end
-
- local types_string = string.match(types_node:getcontent(), "%((.-)%)")
- or types_node:getcontent()
- types_string = utils.replace(types_string, "or", ",")
- local splitted = string.gmatch(types_string, "([^,]+)")
-
- local types = {}
- for t in splitted do
- table.insert(types, utils.sanitize_string(t))
- end
-
- return types
-end
-
-local function process_item_header(node)
- local name_node = extract_first_node_or_fail(node, "strong")
-
- -- Remove starting ":" character on method name
- local name = extract_inner_floating_text(name_node):gsub("^:", "")
- local type = process_item_header_type(node)
-
- local item_header = {
- name = name,
- type = type,
- }
-
- return item_header
-end
-
-local function process_item_content_parameters(node)
- local parameter = {}
-
- local name = extract_first_node_or_fail(node, "span.parameter")
- parameter.name = utils.sanitize_string(name:getcontent())
-
- local nested_parameters_list = extract_first_node(node, "ul")
- if nested_parameters_list then
- parameter.type = "table"
- parameter.nested_parameters = utils.map(
- nested_parameters_list "* > li",
- process_item_content_parameters
- )
-
- return parameter
- end
-
- local types_list = utils.map(node ".type", function(type)
- return utils.sanitize_string(type:getcontent())
- end)
- if #types_list > 0 then
- parameter.type = types_list
- end
-
- return parameter
-end
-
-local function process_item_content(content)
- local parameters_list = extract_item_content_parameter_list(content)
- local parameters = parameters_list
- and utils.map(
- parameters_list "* > li",
- process_item_content_parameters
- )
- or {}
-
- local item_content = {
- parameters = #parameters > 0 and parameters or nil,
- }
-
- return item_content
-end
-
-local function process_section_titles(document)
- return utils.map(document "h2.section-header", extract_inner_floating_text)
-end
-
-local function process_section_items(item)
- local headers = utils.map(item "dt", process_item_header)
- local contents = utils.map(item "dd", process_item_content)
-
- if #headers ~= #contents then
- log:error {
- message = "extract_item_content failure: headers and contents don't have the same size",
- headers = #headers,
- contents = #contents,
- }
- error "extract_item_content"
- end
-
- local item_contents = utils.map(headers, function(header, i)
- return {
- name = header.name,
- type = header.type,
- parameters = contents[i].parameters,
- }
- end)
-
- return item_contents
-end
-
-local function process_section_contents(document)
- local section_items = document "dl.function"
-
- local section_contents = utils.map(section_items, process_section_items)
-
- return section_contents
-end
-
-function scraper.get_doc_from_page(html)
- local document = htmlparser.parse(html, 9999)
-
- local sections_titles = process_section_titles(document)
- local section_contents = process_section_contents(document)
-
- if #sections_titles ~= #section_contents then
- log:error {
- message = "get_items_from_page failure: section_titles and section_contents don't have the same size",
- sections_titles = #sections_titles,
- func = #section_contents,
- }
- error "get_items_from_page"
- end
-
- local doc = utils.map(sections_titles, function(title, i)
- return { section = title, items = section_contents[i] }
- end)
-
- return doc
-end
-
-return scraper
+return {
+ module_doc = require "scraper.module_doc",
+ module_info_list = require "scraper.module_info_list",
+ utils = require "scraper.utils",
+}
diff --git a/src/awesomewm.d.tl/scraper/module_doc.lua b/src/awesomewm.d.tl/scraper/module_doc.lua
new file mode 100644
index 0000000..be2efb6
--- /dev/null
+++ b/src/awesomewm.d.tl/scraper/module_doc.lua
@@ -0,0 +1,88 @@
+local Function_Info = require "entities.Function_Info"
+local Module_Doc = require "entities.Module_Doc"
+local scraper_utils = require "scraper.utils"
+local utils = require "utils"
+
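+-- The <dt> anchor's name attribute may carry a prefix up to ":" (e.g. "module:method"); keep only the method name.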
+local function extract_function_name(function_name_node)
+ return function_name_node and (function_name_node.attr.name:gsub(".*:", ""))
+end
+
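+-- Collect every type tag (span.types .type) from the function's return-value list.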
+local function extract_function_return_types(function_return_types_node)
+ if not function_return_types_node then
+ return {}
+ end
+
+ local selector = "span.types .type"
+ local html = function_return_types_node:outer_html()
+
+ return scraper_utils.scrape(html, selector, function(node)
+ return utils.sanitize_string(node:inner_text())
+ end)
+end
+
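+-- A <dl class="function"> lists one section's functions: each <dt> holds the name anchor, each <dd> the details.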
+local function extract_section_functions(dl)
+ local query_selectors = {
+ function_name = "dt a",
+ function_return_type = "dd ol",
+ }
+
+ return scraper_utils.scrape_tuples(
+ dl,
+ { query_selectors.function_name, query_selectors.function_return_type },
+ function(nodes)
+ local function_info = Function_Info()
+
+ function_info.name =
+ extract_function_name(nodes[query_selectors.function_name])
+ function_info.return_types = extract_function_return_types(
+ nodes[query_selectors.function_return_type]
+ )
+
+ return function_info
+ end
+ )
+end
+
+local module = {}
+
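+-- Section headers (h2) and function lists (dl) appear in matching order, so they can be paired by index.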
+function module.get_doc_from_page(html)
+ local nodes = scraper_utils.extract_nodes(html, {
+ "h2.section-header",
+ "dl.function",
+ })
+
+ if #nodes:get "h2.section-header" ~= #nodes:get "dl.function" then
+ error "The list aren't the same size!"
+ end
+
+ local module_doc = Module_Doc()
+
+ for i, h2 in ipairs(nodes:get "h2.section-header") do
+ local section_name = utils.sanitize_string(h2:inner_text())
+ local dl_html = nodes:get("dl.function")[i]:outer_html()
+
+ if section_name == "Constructors" then
+ module_doc.constructors = extract_section_functions(dl_html)
+ elseif section_name == "Static module functions" then
+ module_doc.static_functions = extract_section_functions(dl_html)
+ elseif section_name == "Object properties" then
+ print "Not implemented: Deprecated object properties"
+ elseif section_name == "Deprecated object properties" then
+ print "Not implemented: Deprecated object properties"
+ elseif section_name == "Object methods" then
+ module_doc.methods = extract_section_functions(dl_html)
+ elseif section_name == "Signals" then
+ print "Not implemented: Signals"
+ else
+ error("Unknown section name: " .. section_name)
+ end
+ end
+
+ return module_doc
+end
+
+return module
diff --git a/src/awesomewm.d.tl/scraper/module_info_list.lua b/src/awesomewm.d.tl/scraper/module_info_list.lua
new file mode 100644
index 0000000..f2a8b5f
--- /dev/null
+++ b/src/awesomewm.d.tl/scraper/module_info_list.lua
@@ -0,0 +1,29 @@
+local Module_Info = require "entities.Module_Info"
+local scraper_utils = require "scraper.utils"
+local utils = require "utils"
+
+local module = {}
+
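+-- Modules are the links of the navigation sidebar on the documentation index page.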
+local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
+
+local function extract_module_info(node)
+ local name = utils.sanitize_string(node:inner_text())
+ local uri = node.attr.href
+
+ if not (name and uri) then
+ error("Can't extract module info from node: " .. node:outer_html())
+ end
+
+ return Module_Info(name, uri)
+end
+
+function module.get_modules_from_index(html)
+ return scraper_utils.scrape(
+ html,
+ MODULE_A_TAG_QUERY_SELECTOR,
+ extract_module_info
+ )
+end
+
+return module
diff --git a/src/awesomewm.d.tl/scraper/utils.lua b/src/awesomewm.d.tl/scraper/utils.lua
new file mode 100644
index 0000000..4668c0c
--- /dev/null
+++ b/src/awesomewm.d.tl/scraper/utils.lua
@@ -0,0 +1,71 @@
+local List = require "pl.List"
+local log = require "logger"
+local Map = require "pl.Map"
+local scanner = require "web_sanitize.query.scan_html"
+local tablex = require "pl.tablex"
+
+local scraper_utils = {}
+
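+-- Call extract_callback on every node matching query_selector; nodes whose extraction fails are logged and skipped.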
+function scraper_utils.scrape(html, query_selector, extract_callback)
+ local ret = {}
+
+ scanner.scan_html(html, function(stack)
+ if stack:is(query_selector) then
+ local node = stack:current()
+ local success, info = pcall(extract_callback, node)
+
+ if not success then
+ log:error { message = info }
+ else
+ table.insert(ret, info)
+ end
+ end
+ end)
+
+ return ret
+end
+
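+-- Collect, keyed by selector and in document order, every node matching one of the query_selectors.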
+function scraper_utils.extract_nodes(html, query_selectors)
+ local siblings = Map()
+
+ tablex.foreach(query_selectors, function(query_selector)
+ siblings:set(query_selector, List())
+ end)
+
+ scanner.scan_html(html, function(stack)
+ tablex.foreach(query_selectors, function(query_selector)
+ if stack:is(query_selector) then
+ siblings:get(query_selector):append(stack:current())
+ end
+ end)
+ end)
+
+ return siblings
+end
+
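+-- Pair matched nodes index-wise across selectors and pass each tuple of nodes to extract_callback.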
+function scraper_utils.scrape_tuples(html, query_selectors, extract_callback)
+ local nodes = scraper_utils.extract_nodes(html, query_selectors)
+
+ local ret = {}
+
+ for i = 1, #nodes:get(query_selectors[1]) do
+ local node_list = {}
+ tablex.foreach(query_selectors, function(query_selector)
+ node_list[query_selector] = nodes:get(query_selector)[i]
+ end)
+ local success, info = pcall(extract_callback, node_list)
+
+ if not success then
+ log:error { message = info }
+ else
+ table.insert(ret, info)
+ end
+ end
+
+ return ret
+end
+
+return scraper_utils