Move to Teal 🚀 #6
|
@ -6,7 +6,7 @@ local tablex = require "pl.tablex"
|
||||||
|
|
||||||
local crawler = {}
|
local crawler = {}
|
||||||
|
|
||||||
local function http_request(url)
|
local function http_request(url: string): string
|
||||||
local body, code_or_error = http.request(url)
|
local body, code_or_error = http.request(url)
|
||||||
|
|
||||||
if not body then
|
if not body then
|
||||||
|
@ -14,7 +14,7 @@ local function http_request(url)
|
||||||
error { "Request failed", err = err, url = url }
|
error { "Request failed", err = err, url = url }
|
||||||
end
|
end
|
||||||
|
|
||||||
local code = code_or_error
|
local code = code_or_error as integer
|
||||||
|
|
||||||
if code < 200 and code >= 300 then
|
if code < 200 and code >= 300 then
|
||||||
error {
|
error {
|
||||||
|
@ -27,7 +27,7 @@ local function http_request(url)
|
||||||
return body
|
return body
|
||||||
end
|
end
|
||||||
|
|
||||||
local function get_resource_by_protocol(url)
|
local function get_resource_by_protocol(url: string): boolean, string
|
||||||
local protocol, resource = url:match "^(%a+)://(.*)$"
|
local protocol, resource = url:match "^(%a+)://(.*)$"
|
||||||
|
|
||||||
if not not tablex.find({ "http", "https" }, protocol) then
|
if not not tablex.find({ "http", "https" }, protocol) then
|
||||||
|
@ -45,7 +45,7 @@ local function get_resource_by_protocol(url)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
function crawler.fetch(url)
|
function crawler.fetch(url: string): string
|
||||||
local success, result = get_resource_by_protocol(url)
|
local success, result = get_resource_by_protocol(url)
|
||||||
|
|
||||||
if not success then
|
if not success then
|
|
@ -1,23 +0,0 @@
|
||||||
local class = require "pl.class"
|
|
||||||
local List = require "pl.List"
|
|
||||||
|
|
||||||
local Function_Info = class.Module_Doc()
|
|
||||||
|
|
||||||
function Function_Info:_init()
|
|
||||||
self.name = ""
|
|
||||||
self.parameters = List()
|
|
||||||
self.return_types = List()
|
|
||||||
end
|
|
||||||
|
|
||||||
function Function_Info:append_return_type(return_type)
|
|
||||||
self.return_types:append(return_type)
|
|
||||||
end
|
|
||||||
|
|
||||||
function Function_Info:append_parameter(name, type)
|
|
||||||
self.parameters:append {
|
|
||||||
name = name,
|
|
||||||
type = type,
|
|
||||||
}
|
|
||||||
end
|
|
||||||
|
|
||||||
return Function_Info
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
local List = require "pl.List"
|
||||||
|
|
||||||
|
local record Parameter
|
||||||
|
name: string
|
||||||
|
type: string
|
||||||
|
end
|
||||||
|
|
||||||
|
local record Function_Info
|
||||||
|
metamethod __call: function(Function_Info): Function_Info
|
||||||
|
|
||||||
|
name: string
|
||||||
|
parameters: List<Parameter>
|
||||||
|
return_types: List<string>
|
||||||
|
|
||||||
|
append_parameter: function(self: Function_Info, parameter: string)
|
||||||
|
append_return_type: function(self: Function_Info, return_type: string)
|
||||||
|
end
|
||||||
|
|
||||||
|
function Function_Info:append_parameter(name: string, type: string)
|
||||||
|
self.parameters:append {
|
||||||
|
name = name,
|
||||||
|
type = type,
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
|
function Function_Info:append_return_type(return_type: string)
|
||||||
|
self.return_types:append(return_type)
|
||||||
|
end
|
||||||
|
|
||||||
|
return Function_Info
|
|
@ -1,12 +0,0 @@
|
||||||
local class = require "pl.class"
|
|
||||||
local List = require "pl.List"
|
|
||||||
|
|
||||||
local Module_Doc = class.Module_Doc()
|
|
||||||
|
|
||||||
function Module_Doc:_init()
|
|
||||||
self.constructors = List()
|
|
||||||
self.methods = List()
|
|
||||||
self.static_functions = List()
|
|
||||||
end
|
|
||||||
|
|
||||||
return Module_Doc
|
|
|
@ -0,0 +1,12 @@
|
||||||
|
local Function_Info = require "entities.Function_Info"
|
||||||
|
local List = require "pl.List"
|
||||||
|
|
||||||
|
local record Module_Doc
|
||||||
|
metamethod __call: function(Module_Doc): Module_Doc
|
||||||
|
|
||||||
|
constructors: List<Function_Info>
|
||||||
|
methods: List<Function_Info>
|
||||||
|
static_functions: List<Function_Info>
|
||||||
|
end
|
||||||
|
|
||||||
|
return Module_Doc
|
|
@ -1,10 +0,0 @@
|
||||||
local class = require "pl.class"
|
|
||||||
|
|
||||||
local Module_Info = class.Module_Info()
|
|
||||||
|
|
||||||
function Module_Info:_init(name, uri)
|
|
||||||
self.name = name
|
|
||||||
self.uri = uri
|
|
||||||
end
|
|
||||||
|
|
||||||
return Module_Info
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
local record Module_Info
|
||||||
|
metamethod __call: function(Module_Info, name: string, uri: string): Module_Info
|
||||||
|
|
||||||
|
name: string
|
||||||
|
uri: string
|
||||||
|
end
|
||||||
|
|
||||||
|
return Module_Info
|
|
@ -5,7 +5,7 @@ local log = require "logger"
|
||||||
local utils = require "utils"
|
local utils = require "utils"
|
||||||
local snippets = require "generator.snippets"
|
local snippets = require "generator.snippets"
|
||||||
|
|
||||||
local tmpl = (function(mod)
|
local tmpl = (function(mod: string): string
|
||||||
local package_path = utils.do_or_fail(path.package_path, mod)
|
local package_path = utils.do_or_fail(path.package_path, mod)
|
||||||
local package_dir = path.dirname(package_path)
|
local package_dir = path.dirname(package_path)
|
||||||
return utils.do_or_fail(file.read, package_dir .. "/template.tl.tmpl", false)
|
return utils.do_or_fail(file.read, package_dir .. "/template.tl.tmpl", false)
|
||||||
|
@ -13,10 +13,22 @@ end)(...)
|
||||||
|
|
||||||
local generator = {}
|
local generator = {}
|
||||||
|
|
||||||
function generator.generate_teal(data)
|
local record Generate_Teal_Data_Record
|
||||||
|
section: string
|
||||||
|
items: { snippets.Anonymous_Function_Record }
|
||||||
|
end
|
||||||
|
function generator.generate_teal(data: { Generate_Teal_Data_Record }): string
|
||||||
-- TODO : add the required modules to the generated code
|
-- TODO : add the required modules to the generated code
|
||||||
-- TODO : replace this with a proper way to get the module name (will also probably need the module path)
|
-- TODO : replace this with a proper way to get the module name (will also probably need the module path)
|
||||||
local module_data = { name = "module_name" }
|
local record Module_Data_Record
|
||||||
|
name: string
|
||||||
|
static_functions: { snippets.Anonymous_Function_Record }
|
||||||
|
constructors: { snippets.Anonymous_Function_Record }
|
||||||
|
methods: { string }
|
||||||
|
properties: { snippets.Anonymous_Function_Record }
|
||||||
|
signals: { snippets.Anonymous_Function_Record }
|
||||||
|
end
|
||||||
|
local module_data: Module_Data_Record = { name = "module_name" }
|
||||||
for _, item in ipairs(data) do
|
for _, item in ipairs(data) do
|
||||||
if item.section == "Static functions" then
|
if item.section == "Static functions" then
|
||||||
-- TODO
|
-- TODO
|
||||||
|
@ -49,19 +61,19 @@ function generator.generate_teal(data)
|
||||||
return utils.do_or_fail(template.substitute, tmpl, env)
|
return utils.do_or_fail(template.substitute, tmpl, env)
|
||||||
end
|
end
|
||||||
|
|
||||||
function generator.write(file_content, file_path)
|
function generator.write(file_content:string, file_path: string)
|
||||||
-- Make sure the directory we want to write the file to exists
|
-- Make sure the directory we want to write the file to exists
|
||||||
local directory = path.dirname(file_path)
|
local directory = path.dirname(file_path)
|
||||||
if not path.isdir(directory) then
|
if not path.isdir(directory) then
|
||||||
path.mkdir(directory)
|
path.mkdir(directory)
|
||||||
end
|
end
|
||||||
|
|
||||||
local success, error = file.write(file_path, file_content, false)
|
local success, error_message = file.write(file_path, file_content, false)
|
||||||
|
|
||||||
if not success then
|
if not success then
|
||||||
log:error {
|
log:error {
|
||||||
"generator.write error",
|
"generator.write error",
|
||||||
error = error,
|
error = error_message,
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
end
|
end
|
|
@ -2,10 +2,10 @@ local utils = require "utils"
|
||||||
local template = require "pl.template"
|
local template = require "pl.template"
|
||||||
|
|
||||||
-- Refactor scraper code to use pl.List objects
|
-- Refactor scraper code to use pl.List objects
|
||||||
local function join(arr, delim)
|
local function join<T>(arr: { string }, delim: string): string
|
||||||
local ret = ""
|
local ret = ""
|
||||||
for i, type in ipairs(arr) do
|
for i, t in ipairs(arr) do
|
||||||
ret = ret .. type
|
ret = ret .. t
|
||||||
if i < #arr then
|
if i < #arr then
|
||||||
ret = ret .. delim
|
ret = ret .. delim
|
||||||
end
|
end
|
||||||
|
@ -13,13 +13,27 @@ local function join(arr, delim)
|
||||||
return ret
|
return ret
|
||||||
end
|
end
|
||||||
|
|
||||||
local snippets = {}
|
local record Anonymous_Function_Parameter_Record
|
||||||
|
name: string
|
||||||
|
type: { string }
|
||||||
|
end
|
||||||
|
|
||||||
function snippets.types_list(types)
|
local record Anonymous_Function_Record
|
||||||
|
name: string
|
||||||
|
parameters: { Anonymous_Function_Parameter_Record }
|
||||||
|
returns: { string }
|
||||||
|
end
|
||||||
|
|
||||||
|
local snippets = {
|
||||||
|
Anonymous_Function_Parameter_Record = Anonymous_Function_Parameter_Record,
|
||||||
|
Anonymous_Function_Record = Anonymous_Function_Record,
|
||||||
|
}
|
||||||
|
|
||||||
|
function snippets.types_list(types: { string }): string
|
||||||
return join(types, ", ")
|
return join(types, ", ")
|
||||||
end
|
end
|
||||||
|
|
||||||
function snippets.anonymous_function(item)
|
function snippets.anonymous_function(item: Anonymous_Function_Record): string
|
||||||
local parameters_string = ""
|
local parameters_string = ""
|
||||||
if item.parameters then
|
if item.parameters then
|
||||||
for i, param in ipairs(item.parameters) do
|
for i, param in ipairs(item.parameters) do
|
|
@ -1,18 +0,0 @@
|
||||||
local ansicolors = require "ansicolors"
|
|
||||||
local console = require "logging.console"
|
|
||||||
local ll = require "logging"
|
|
||||||
|
|
||||||
local log = console {
|
|
||||||
logLevel = ll.DEBUG,
|
|
||||||
destination = "stdout",
|
|
||||||
timestampPattern = "[%y-%m-%d %H:%M:%S]",
|
|
||||||
logPatterns = {
|
|
||||||
[ll.DEBUG] = ansicolors "%date%{cyan} %level %message %{reset}(%source)\n",
|
|
||||||
[ll.INFO] = ansicolors "%date %level %message\n",
|
|
||||||
[ll.WARN] = ansicolors "%date%{yellow} %level %message\n",
|
|
||||||
[ll.ERROR] = ansicolors "%date%{red bright} %level %message %{reset}(%source)\n",
|
|
||||||
[ll.FATAL] = ansicolors "%date%{magenta bright} %level %message %{reset}(%source)\n",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
return log
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
local ansicolors = require "ansicolors"
|
||||||
|
local logging_console = require "logging.console"
|
||||||
|
|
||||||
|
local log = logging_console {
|
||||||
|
logLevel = "DEBUG",
|
||||||
|
destination = "stdout",
|
||||||
|
timestampPattern = "[%y-%m-%d %H:%M:%S]",
|
||||||
|
logPatterns = {
|
||||||
|
DEBUG = ansicolors "%date%{cyan} %level %message %{reset}(%source)\n",
|
||||||
|
INFO = ansicolors "%date %level %message\n",
|
||||||
|
WARN = ansicolors "%date%{yellow} %level %message\n",
|
||||||
|
ERROR = ansicolors "%date%{red bright} %level %message %{reset}(%source)\n",
|
||||||
|
FATAL = ansicolors "%date%{magenta bright} %level %message %{reset}(%source)\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return log
|
|
@ -1,66 +0,0 @@
|
||||||
local properties = {}
|
|
||||||
|
|
||||||
-- properties.base_url = "https://awesomewm.org/apidoc"
|
|
||||||
properties.base_url = "file:///usr/share/doc/awesome/doc"
|
|
||||||
|
|
||||||
properties.index_uri = "/index.html"
|
|
||||||
|
|
||||||
properties.out_directory = "generated"
|
|
||||||
|
|
||||||
--- Pages from the navigation menu to ignore.
|
|
||||||
-- Sets to ignore documentations and sample file. I also added libraries with
|
|
||||||
-- low quality API documentation, I'll probably work on them later, lets start
|
|
||||||
-- with what works the best first.
|
|
||||||
properties.ignored_modules = {
|
|
||||||
-- Sample files
|
|
||||||
"rc.lua",
|
|
||||||
"theme.lua",
|
|
||||||
|
|
||||||
-- Utility libraries
|
|
||||||
"gears.debug",
|
|
||||||
"gears.filesystem",
|
|
||||||
"gears.geometry",
|
|
||||||
"gears.math",
|
|
||||||
"gears.object",
|
|
||||||
"gears.protected_call",
|
|
||||||
"gears.sort",
|
|
||||||
"gears.string",
|
|
||||||
"gears.table",
|
|
||||||
"gears.wallpaper",
|
|
||||||
|
|
||||||
-- Theme related libraries
|
|
||||||
"beautiful",
|
|
||||||
"gears.color",
|
|
||||||
"gears.shape",
|
|
||||||
|
|
||||||
-- Classes
|
|
||||||
"awful.widget.common",
|
|
||||||
"gears.cache",
|
|
||||||
"gears.matrix",
|
|
||||||
"menubar.icon_theme",
|
|
||||||
"menubar.index_theme",
|
|
||||||
"signals",
|
|
||||||
"wibox.drawable",
|
|
||||||
"wibox.hierarchy",
|
|
||||||
"wibox.widget.base",
|
|
||||||
"xproperties",
|
|
||||||
|
|
||||||
-- Documentation
|
|
||||||
"Authors",
|
|
||||||
"Readme",
|
|
||||||
"Contributing",
|
|
||||||
"The Widget system",
|
|
||||||
"Creating new widget",
|
|
||||||
"Default configuration file documentation",
|
|
||||||
"Change Awesome appearance",
|
|
||||||
"My first Awesome",
|
|
||||||
"The AwesomeWM client layout system",
|
|
||||||
"Startup options",
|
|
||||||
"Building and Testing",
|
|
||||||
"Using Cairo and LGI",
|
|
||||||
"Tips for upgrading your configuration",
|
|
||||||
"NEWS",
|
|
||||||
"FAQ",
|
|
||||||
}
|
|
||||||
|
|
||||||
return properties
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
local record Properties
|
||||||
|
base_url: string
|
||||||
|
index_uri: string
|
||||||
|
|
||||||
|
out_directory: string
|
||||||
|
|
||||||
|
--- Pages from the navigation menu to ignore.
|
||||||
|
-- Sets to ignore documentations and sample file. I also added libraries with
|
||||||
|
-- low quality API documentation, I'll probably work on them later, lets start
|
||||||
|
-- with what works the best first.
|
||||||
|
ignored_modules: { string }
|
||||||
|
end
|
||||||
|
|
||||||
|
local properties: Properties = {
|
||||||
|
-- base_url = "https://awesomewm.org/apidoc",
|
||||||
|
base_url = "file:///usr/share/doc/awesome/doc",
|
||||||
|
index_uri = "/index.html",
|
||||||
|
out_directory = "generated",
|
||||||
|
ignored_modules = {
|
||||||
|
-- Sample files
|
||||||
|
"rc.lua",
|
||||||
|
"theme.lua",
|
||||||
|
|
||||||
|
-- Utility libraries
|
||||||
|
"gears.debug",
|
||||||
|
"gears.filesystem",
|
||||||
|
"gears.geometry",
|
||||||
|
"gears.math",
|
||||||
|
"gears.object",
|
||||||
|
"gears.protected_call",
|
||||||
|
"gears.sort",
|
||||||
|
"gears.string",
|
||||||
|
"gears.table",
|
||||||
|
"gears.wallpaper",
|
||||||
|
|
||||||
|
-- Theme related libraries
|
||||||
|
"beautiful",
|
||||||
|
"gears.color",
|
||||||
|
"gears.shape",
|
||||||
|
|
||||||
|
-- Classes
|
||||||
|
"awful.widget.common",
|
||||||
|
"gears.cache",
|
||||||
|
"gears.matrix",
|
||||||
|
"menubar.icon_theme",
|
||||||
|
"menubar.index_theme",
|
||||||
|
"signals",
|
||||||
|
"wibox.drawable",
|
||||||
|
"wibox.hierarchy",
|
||||||
|
"wibox.widget.base",
|
||||||
|
"xproperties",
|
||||||
|
|
||||||
|
-- Documentation
|
||||||
|
"Authors",
|
||||||
|
"Readme",
|
||||||
|
"Contributing",
|
||||||
|
"The Widget system",
|
||||||
|
"Creating new widget",
|
||||||
|
"Default configuration file documentation",
|
||||||
|
"Change Awesome appearance",
|
||||||
|
"My first Awesome",
|
||||||
|
"The AwesomeWM client layout system",
|
||||||
|
"Startup options",
|
||||||
|
"Building and Testing",
|
||||||
|
"Using Cairo and LGI",
|
||||||
|
"Tips for upgrading your configuration",
|
||||||
|
"NEWS",
|
||||||
|
"FAQ",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return properties
|
|
@ -1,13 +1,15 @@
|
||||||
local Function_Info = require "entities.Function_Info"
|
local Function_Info = require "entities.Function_Info"
|
||||||
|
local List = require "pl.List"
|
||||||
local Module_Doc = require "entities.Module_Doc"
|
local Module_Doc = require "entities.Module_Doc"
|
||||||
|
local scan = require "web_sanitize.query.scan_html"
|
||||||
local scraper_utils = require "scraper.utils"
|
local scraper_utils = require "scraper.utils"
|
||||||
local utils = require "utils"
|
local utils = require "utils"
|
||||||
|
|
||||||
local function extract_function_name(function_name_node)
|
local function extract_function_name(function_name_node: scan.HTMLNode): string
|
||||||
return function_name_node and (function_name_node.attr.name:gsub(".*:", ""))
|
return function_name_node and ((function_name_node.attr.name as string):gsub(".*:", ""))
|
||||||
end
|
end
|
||||||
|
|
||||||
local function extract_function_return_types(function_return_types_node)
|
local function extract_function_return_types(function_return_types_node: scan.HTMLNode): { string }
|
||||||
if not function_return_types_node then
|
if not function_return_types_node then
|
||||||
return {}
|
return {}
|
||||||
end
|
end
|
||||||
|
@ -15,12 +17,12 @@ local function extract_function_return_types(function_return_types_node)
|
||||||
local selector = "span.types .type"
|
local selector = "span.types .type"
|
||||||
local html = function_return_types_node:outer_html()
|
local html = function_return_types_node:outer_html()
|
||||||
|
|
||||||
return scraper_utils.scrape(html, selector, function(node)
|
return scraper_utils.scrape(html, selector, function(node: scan.HTMLNode): string
|
||||||
return utils.sanitize_string(node:inner_text())
|
return utils.sanitize_string(node:inner_text())
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
local function extract_section_functions(dl)
|
local function extract_section_functions(dl: string): { Function_Info }
|
||||||
local query_selectors = {
|
local query_selectors = {
|
||||||
function_name = "dt a",
|
function_name = "dt a",
|
||||||
function_return_type = "dd ol",
|
function_return_type = "dd ol",
|
||||||
|
@ -29,13 +31,15 @@ local function extract_section_functions(dl)
|
||||||
return scraper_utils.scrape_tuples(
|
return scraper_utils.scrape_tuples(
|
||||||
dl,
|
dl,
|
||||||
{ query_selectors.function_name, query_selectors.function_return_type },
|
{ query_selectors.function_name, query_selectors.function_return_type },
|
||||||
function(nodes)
|
function(nodes: { string : scan.HTMLNode | nil }): Function_Info
|
||||||
local function_info = Function_Info()
|
local function_info = Function_Info()
|
||||||
|
|
||||||
function_info.name =
|
function_info.name =
|
||||||
extract_function_name(nodes[query_selectors.function_name])
|
extract_function_name(nodes[query_selectors.function_name])
|
||||||
function_info.return_types = extract_function_return_types(
|
function_info.return_types = List(
|
||||||
nodes[query_selectors.function_return_type]
|
extract_function_return_types(
|
||||||
|
nodes[query_selectors.function_return_type]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return function_info
|
return function_info
|
||||||
|
@ -45,7 +49,7 @@ end
|
||||||
|
|
||||||
local module = {}
|
local module = {}
|
||||||
|
|
||||||
function module.get_doc_from_page(html)
|
function module.get_doc_from_page(html: string): Module_Doc
|
||||||
local nodes = scraper_utils.extract_nodes(html, {
|
local nodes = scraper_utils.extract_nodes(html, {
|
||||||
"h2.section-header",
|
"h2.section-header",
|
||||||
"dl.function",
|
"dl.function",
|
||||||
|
@ -57,20 +61,21 @@ function module.get_doc_from_page(html)
|
||||||
|
|
||||||
local module_doc = Module_Doc()
|
local module_doc = Module_Doc()
|
||||||
|
|
||||||
for i, h2 in ipairs(nodes:get "h2.section-header") do
|
for i = 1, #nodes:get("h2.section-header") do
|
||||||
|
local h2 = nodes:get("h2.section-header")[i]
|
||||||
local section_name = utils.sanitize_string(h2:inner_text())
|
local section_name = utils.sanitize_string(h2:inner_text())
|
||||||
local dl_html = nodes:get("dl.function")[i]:outer_html()
|
local dl_html = nodes:get("dl.function")[i]:outer_html()
|
||||||
|
|
||||||
if section_name == "Constructors" then
|
if section_name == "Constructors" then
|
||||||
module_doc.constructors = extract_section_functions(dl_html)
|
module_doc.constructors = List(extract_section_functions(dl_html))
|
||||||
elseif section_name == "Static module functions" then
|
elseif section_name == "Static module functions" then
|
||||||
module_doc.static_functions = extract_section_functions(dl_html)
|
module_doc.static_functions = List(extract_section_functions(dl_html))
|
||||||
elseif section_name == "Object properties" then
|
elseif section_name == "Object properties" then
|
||||||
print "Not implemented: Deprecated object properties"
|
print "Not implemented: Deprecated object properties"
|
||||||
elseif section_name == "Deprecated object properties" then
|
elseif section_name == "Deprecated object properties" then
|
||||||
print "Not implemented: Deprecated object properties"
|
print "Not implemented: Deprecated object properties"
|
||||||
elseif section_name == "Object methods" then
|
elseif section_name == "Object methods" then
|
||||||
module_doc.methods = extract_section_functions(dl_html)
|
module_doc.methods = List(extract_section_functions(dl_html))
|
||||||
elseif section_name == "Signals" then
|
elseif section_name == "Signals" then
|
||||||
print "Not implemented: Signals"
|
print "Not implemented: Signals"
|
||||||
else
|
else
|
|
@ -1,4 +1,5 @@
|
||||||
local Module_Info = require "entities.Module_Info"
|
local Module_Info = require "entities.Module_Info"
|
||||||
|
local scan = require "web_sanitize.query.scan_html"
|
||||||
local scraper_utils = require "scraper.utils"
|
local scraper_utils = require "scraper.utils"
|
||||||
local utils = require "utils"
|
local utils = require "utils"
|
||||||
|
|
||||||
|
@ -6,9 +7,9 @@ local module = {}
|
||||||
|
|
||||||
local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
|
local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
|
||||||
|
|
||||||
local function extract_module_info(node)
|
local function extract_module_info(node: scan.HTMLNode): Module_Info
|
||||||
local name = utils.sanitize_string(node:inner_text())
|
local name = utils.sanitize_string(node:inner_text())
|
||||||
local uri = node.attr.href
|
local uri = node.attr.href as string
|
||||||
|
|
||||||
if not (name and uri) then
|
if not (name and uri) then
|
||||||
error("Can't extract module info from node: " .. node:outer_html())
|
error("Can't extract module info from node: " .. node:outer_html())
|
||||||
|
@ -17,7 +18,7 @@ local function extract_module_info(node)
|
||||||
return Module_Info(name, uri)
|
return Module_Info(name, uri)
|
||||||
end
|
end
|
||||||
|
|
||||||
function module.get_modules_from_index(html)
|
function module.get_modules_from_index(html: string): { Module_Info }
|
||||||
return scraper_utils.scrape(
|
return scraper_utils.scrape(
|
||||||
html,
|
html,
|
||||||
MODULE_A_TAG_QUERY_SELECTOR,
|
MODULE_A_TAG_QUERY_SELECTOR,
|
|
@ -1,68 +0,0 @@
|
||||||
local List = require "pl.List"
|
|
||||||
local log = require "logger"
|
|
||||||
local Map = require "pl.Map"
|
|
||||||
local scanner = require "web_sanitize.query.scan_html"
|
|
||||||
local tablex = require "pl.tablex"
|
|
||||||
|
|
||||||
local scraper_utils = {}
|
|
||||||
|
|
||||||
function scraper_utils.scrape(html, query_selector, extract_callback)
|
|
||||||
local ret = {}
|
|
||||||
|
|
||||||
scanner.scan_html(html, function(stack)
|
|
||||||
if stack:is(query_selector) then
|
|
||||||
local node = stack:current()
|
|
||||||
local success, info = pcall(extract_callback, node)
|
|
||||||
|
|
||||||
if not success then
|
|
||||||
log:error { message = info }
|
|
||||||
else
|
|
||||||
table.insert(ret, info)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end)
|
|
||||||
|
|
||||||
return ret
|
|
||||||
end
|
|
||||||
|
|
||||||
function scraper_utils.extract_nodes(html, query_selectors)
|
|
||||||
local siblings = Map()
|
|
||||||
|
|
||||||
tablex.foreach(query_selectors, function(query_selector)
|
|
||||||
siblings:set(query_selector, List())
|
|
||||||
end)
|
|
||||||
|
|
||||||
scanner.scan_html(html, function(stack)
|
|
||||||
tablex.foreach(query_selectors, function(query_selector)
|
|
||||||
if stack:is(query_selector) then
|
|
||||||
siblings:get(query_selector):append(stack:current())
|
|
||||||
end
|
|
||||||
end)
|
|
||||||
end)
|
|
||||||
|
|
||||||
return siblings
|
|
||||||
end
|
|
||||||
|
|
||||||
function scraper_utils.scrape_tuples(html, query_selectors, extract_callback)
|
|
||||||
local nodes = scraper_utils.extract_nodes(html, query_selectors)
|
|
||||||
|
|
||||||
local ret = {}
|
|
||||||
|
|
||||||
for i = 1, #nodes:get(query_selectors[1]) do
|
|
||||||
local node_list = {}
|
|
||||||
tablex.foreach(query_selectors, function(query_selector)
|
|
||||||
node_list[query_selector] = nodes:get(query_selector)[i] or nil
|
|
||||||
end)
|
|
||||||
local success, info = pcall(extract_callback, node_list)
|
|
||||||
|
|
||||||
if not success then
|
|
||||||
log:error { message = info }
|
|
||||||
else
|
|
||||||
table.insert(ret, info)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return ret
|
|
||||||
end
|
|
||||||
|
|
||||||
return scraper_utils
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
local List = require "pl.List"
|
||||||
|
local log = require "logger"
|
||||||
|
local Map = require "pl.Map"
|
||||||
|
local scan = require "web_sanitize.query.scan_html"
|
||||||
|
local scanner = require "web_sanitize.query.scan_html"
|
||||||
|
local tablex = require "pl.tablex"
|
||||||
|
|
||||||
|
local scraper_utils = {}
|
||||||
|
|
||||||
|
function scraper_utils.scrape<T>(html: string, query_selector: string, extract_callback: function(node: scan.HTMLNode): T): { T }
|
||||||
|
local ret: { T } = {}
|
||||||
|
|
||||||
|
scanner.scan_html(html, function(stack: scan.NodeStack)
|
||||||
|
if stack:is(query_selector) then
|
||||||
|
local node = stack:current()
|
||||||
|
local success, info_or_error = pcall(extract_callback, node)
|
||||||
|
|
||||||
|
if not success then
|
||||||
|
local error_message = info_or_error as string
|
||||||
|
log:error { message = error_message }
|
||||||
|
else
|
||||||
|
local info = info_or_error as T
|
||||||
|
table.insert(ret, info)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end)
|
||||||
|
|
||||||
|
return ret
|
||||||
|
end
|
||||||
|
|
||||||
|
function scraper_utils.extract_nodes(html: string, query_selectors: { string }): Map<string, List<scan.HTMLNode>>
|
||||||
|
local siblings: Map<string, List<scan.HTMLNode>> = Map()
|
||||||
|
|
||||||
|
tablex.foreach(query_selectors, function(query_selector: string)
|
||||||
|
siblings:set(query_selector, List())
|
||||||
|
end)
|
||||||
|
|
||||||
|
scanner.scan_html(html, function(stack: scan.NodeStack)
|
||||||
|
tablex.foreach(query_selectors, function(query_selector: string)
|
||||||
|
if stack:is(query_selector) then
|
||||||
|
siblings:get(query_selector):append(stack:current())
|
||||||
|
end
|
||||||
|
end)
|
||||||
|
end)
|
||||||
|
|
||||||
|
return siblings
|
||||||
|
end
|
||||||
|
|
||||||
|
function scraper_utils.scrape_tuples<T>(html: string, query_selectors: { string }, extract_callback: function(tuple: { string : scan.HTMLNode | nil }): T): { T }
|
||||||
|
local nodes = scraper_utils.extract_nodes(html, query_selectors)
|
||||||
|
|
||||||
|
local ret: { T } = {}
|
||||||
|
|
||||||
|
for i = 1, #nodes:get(query_selectors[1]) do
|
||||||
|
local node_list: { string : scan.HTMLNode | nil } = {}
|
||||||
|
tablex.foreach(query_selectors, function(query_selector: string)
|
||||||
|
node_list[query_selector] = nodes:get(query_selector)[i] or nil
|
||||||
|
end)
|
||||||
|
local success, info_or_error = pcall(extract_callback, node_list)
|
||||||
|
|
||||||
|
if not success then
|
||||||
|
local error_message = info_or_error as string
|
||||||
|
log:error { message = error_message }
|
||||||
|
else
|
||||||
|
local info = info_or_error as T
|
||||||
|
table.insert(ret, info)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
return ret
|
||||||
|
end
|
||||||
|
|
||||||
|
return scraper_utils
|
|
@ -1,70 +0,0 @@
|
||||||
local web_sanitize = require "web_sanitize"
|
|
||||||
|
|
||||||
local utils = {}
|
|
||||||
|
|
||||||
function utils.has_item(table, item)
|
|
||||||
for k, v in pairs(table) do
|
|
||||||
if v == item then
|
|
||||||
return k
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return nil
|
|
||||||
end
|
|
||||||
|
|
||||||
function utils.filter(list, predicate)
|
|
||||||
local filtered = {}
|
|
||||||
|
|
||||||
for position, value in ipairs(list) do
|
|
||||||
if predicate(value, position) then
|
|
||||||
table.insert(filtered, value)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
return filtered
|
|
||||||
end
|
|
||||||
|
|
||||||
function utils.map(list, iteratee)
|
|
||||||
local mapped = {}
|
|
||||||
|
|
||||||
for position, value in ipairs(list) do
|
|
||||||
table.insert(mapped, iteratee(value, position))
|
|
||||||
end
|
|
||||||
|
|
||||||
return mapped
|
|
||||||
end
|
|
||||||
|
|
||||||
function utils.sanitize_string(string)
|
|
||||||
return utils.trim(
|
|
||||||
utils.replace(web_sanitize.extract_text(string), "^%s*(.-)%s*$", "%1")
|
|
||||||
)
|
|
||||||
end
|
|
||||||
|
|
||||||
-- Extracted from teh Penlight Lua library.
|
|
||||||
-- Sometime Lua string.gsub can't match unescaped strings.
|
|
||||||
-- https://stackoverflow.com/a/72666170
|
|
||||||
function utils.escape(string)
|
|
||||||
return (string:gsub("[%-%.%+%[%]%(%)%$%^%%%?%*]", "%%%1"))
|
|
||||||
end
|
|
||||||
|
|
||||||
function utils.replace(string, old, new, n)
|
|
||||||
return (string:gsub(utils.escape(old), new:gsub("%%", "%%%%"), n))
|
|
||||||
end
|
|
||||||
|
|
||||||
function utils.trim(string)
|
|
||||||
return string:match "^%s*(.-)%s*$"
|
|
||||||
end
|
|
||||||
|
|
||||||
function utils.do_or_fail(func, ...)
|
|
||||||
local log = require "logger"
|
|
||||||
local res, err = func(...)
|
|
||||||
|
|
||||||
if not res then
|
|
||||||
log:error { "do_or_fail failed!", error = err }
|
|
||||||
error(err)
|
|
||||||
end
|
|
||||||
|
|
||||||
return res
|
|
||||||
end
|
|
||||||
|
|
||||||
return utils
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
local web_sanitize = require "web_sanitize"
|
||||||
|
|
||||||
|
local utils = {}
|
||||||
|
|
||||||
|
function utils.has_item(t: table, item: any): any
|
||||||
|
for k, v in pairs(t) do
|
||||||
|
if v == item then
|
||||||
|
return k
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
return nil
|
||||||
|
end
|
||||||
|
|
||||||
|
function utils.filter<T>(list: { T }, predicate: function(value: T, position: integer): boolean): { T }
|
||||||
|
local filtered: { T } = {}
|
||||||
|
|
||||||
|
for position, value in ipairs(list) do
|
||||||
|
if predicate(value, position) then
|
||||||
|
table.insert(filtered, value)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
return filtered
|
||||||
|
end
|
||||||
|
|
||||||
|
function utils.map<T, U>(list: { T }, iteratee: function(value: T, position: integer): U): { U }
|
||||||
|
local mapped: { U } = {}
|
||||||
|
|
||||||
|
for position, value in ipairs(list) do
|
||||||
|
table.insert(mapped, iteratee(value, position))
|
||||||
|
end
|
||||||
|
|
||||||
|
return mapped
|
||||||
|
end
|
||||||
|
|
||||||
|
-- Extracted from teh Penlight Lua library.
|
||||||
|
-- Sometime Lua string.gsub can't match unescaped strings.
|
||||||
|
-- https://stackoverflow.com/a/72666170
|
||||||
|
function utils.escape(s: string): string
|
||||||
|
return (s:gsub("[%-%.%+%[%]%(%)%$%^%%%?%*]", "%%%1"))
|
||||||
|
end
|
||||||
|
|
||||||
|
function utils.replace(s: string, old: string, new: string, n: number): string
|
||||||
|
return (s:gsub(utils.escape(old), new:gsub("%%", "%%%%"), n))
|
||||||
|
end
|
||||||
|
|
||||||
|
function utils.trim(s: string): string
|
||||||
|
return s:match "^%s*(.-)%s*$"
|
||||||
|
end
|
||||||
|
|
||||||
|
function utils.sanitize_string(s: string): string
|
||||||
|
return utils.trim(
|
||||||
|
utils.replace(web_sanitize.extract_text(s), "^%s*(.-)%s*$", "%1")
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
-- At some point, we should probably write a wrapper to make penlight's function work with pcalls.
|
||||||
|
local type Func = function<T>(...: any): T | nil, string
|
||||||
|
function utils.do_or_fail<T>(func: Func<T>, ...: any): T
|
||||||
|
local log = require "logger"
|
||||||
|
local res, err = func(...)
|
||||||
|
|
||||||
|
if not res then
|
||||||
|
log:error { "do_or_fail failed!", error = err }
|
||||||
|
error(err)
|
||||||
|
end
|
||||||
|
|
||||||
|
return res
|
||||||
|
end
|
||||||
|
|
||||||
|
return utils
|
Loading…
Reference in New Issue