feat: move sources to Teal 🚀
This commit is contained in:
parent
978ac20387
commit
483cfa6a6f
|
@ -6,7 +6,7 @@ local tablex = require "pl.tablex"
|
|||
|
||||
local crawler = {}
|
||||
|
||||
local function http_request(url)
|
||||
local function http_request(url: string): string
|
||||
local body, code_or_error = http.request(url)
|
||||
|
||||
if not body then
|
||||
|
@ -14,7 +14,7 @@ local function http_request(url)
|
|||
error { "Request failed", err = err, url = url }
|
||||
end
|
||||
|
||||
local code = code_or_error
|
||||
local code = code_or_error as integer
|
||||
|
||||
if code < 200 and code >= 300 then
|
||||
error {
|
||||
|
@ -27,7 +27,7 @@ local function http_request(url)
|
|||
return body
|
||||
end
|
||||
|
||||
local function get_resource_by_protocol(url)
|
||||
local function get_resource_by_protocol(url: string): boolean, string
|
||||
local protocol, resource = url:match "^(%a+)://(.*)$"
|
||||
|
||||
if not not tablex.find({ "http", "https" }, protocol) then
|
||||
|
@ -45,7 +45,7 @@ local function get_resource_by_protocol(url)
|
|||
end
|
||||
end
|
||||
|
||||
function crawler.fetch(url)
|
||||
function crawler.fetch(url: string): string
|
||||
local success, result = get_resource_by_protocol(url)
|
||||
|
||||
if not success then
|
|
@ -1,23 +0,0 @@
|
|||
local class = require "pl.class"
|
||||
local List = require "pl.List"
|
||||
|
||||
--- Description of a function: its name, its parameters and its return types.
-- The class is created under its own name (`class.Function_Info()`), so the
-- name Penlight attaches to the class matches the variable it is stored in
-- instead of clashing with the separate `Module_Doc` class.
local Function_Info = class.Function_Info()

--- Initialise an empty description: blank name, no parameters, no return types.
function Function_Info:_init()
    self.name = ""
    self.parameters = List()
    self.return_types = List()
end

--- Append one return type to `return_types`.
-- @param return_type value appended to the list
function Function_Info:append_return_type(return_type)
    self.return_types:append(return_type)
end

--- Append one parameter to `parameters` as a `{ name = ..., type = ... }` entry.
-- @param name stored under the `name` key
-- @param type stored under the `type` key
function Function_Info:append_parameter(name, type)
    self.parameters:append {
        name = name,
        type = type,
    }
end
|
||||
|
||||
return Function_Info
|
|
@ -0,0 +1,30 @@
|
|||
local List = require "pl.List"
|
||||
|
||||
--- One parameter of a function, stored as a name/type pair.
-- Entries of this shape are what `Function_Info:append_parameter` appends.
local record Parameter
|
||||
-- Name of the parameter.
name: string
|
||||
-- Type of the parameter, kept as a string.
type: string
|
||||
end
|
||||
|
||||
--- Description of a function: its name, its parameters and its return types.
local record Function_Info
    -- Declares that calling `Function_Info` itself yields a `Function_Info`.
    metamethod __call: function(Function_Info): Function_Info

    name: string
    parameters: List<Parameter>
    return_types: List<string>

    -- The name and the type are two separate string arguments, matching the
    -- `Function_Info:append_parameter(name, type)` method defined in this file.
    append_parameter: function(self: Function_Info, name: string, type: string)
    append_return_type: function(self: Function_Info, return_type: string)
end
|
||||
|
||||
--- Append one parameter to this function's `parameters` list.
-- @param name stored in the `name` field of the new entry
-- @param type stored in the `type` field of the new entry
function Function_Info:append_parameter(name: string, type: string)
    local parameter: Parameter = { name = name, type = type }
    self.parameters:append(parameter)
end
|
||||
|
||||
--- Append one return type to this function's `return_types` list.
-- @param return_type type name to add at the end of the list
function Function_Info:append_return_type(return_type: string)
    local return_types = self.return_types
    return_types:append(return_type)
end
|
||||
|
||||
return Function_Info
|
|
@ -1,12 +0,0 @@
|
|||
local class = require "pl.class"
|
||||
local List = require "pl.List"
|
||||
|
||||
local Module_Doc = class.Module_Doc()
|
||||
|
||||
function Module_Doc:_init()
|
||||
self.constructors = List()
|
||||
self.methods = List()
|
||||
self.static_functions = List()
|
||||
end
|
||||
|
||||
return Module_Doc
|
|
@ -0,0 +1,12 @@
|
|||
local Function_Info = require "entities.Function_Info"
|
||||
local List = require "pl.List"
|
||||
|
||||
--- Functions of one module, grouped into three lists of `Function_Info`.
local record Module_Doc
|
||||
-- Declares that calling `Module_Doc` itself yields a `Module_Doc`.
metamethod __call: function(Module_Doc): Module_Doc
|
||||
|
||||
-- Constructor functions.
constructors: List<Function_Info>
|
||||
-- Object methods.
methods: List<Function_Info>
|
||||
-- Static (module-level) functions.
static_functions: List<Function_Info>
|
||||
end
|
||||
|
||||
return Module_Doc
|
|
@ -1,10 +0,0 @@
|
|||
local class = require "pl.class"
|
||||
|
||||
local Module_Info = class.Module_Info()
|
||||
|
||||
function Module_Info:_init(name, uri)
|
||||
self.name = name
|
||||
self.uri = uri
|
||||
end
|
||||
|
||||
return Module_Info
|
|
@ -0,0 +1,8 @@
|
|||
--- Name and URI of one module.
local record Module_Info
|
||||
-- Declares that calling `Module_Info` with a name and a URI yields a `Module_Info`.
metamethod __call: function(Module_Info, name: string, uri: string): Module_Info
|
||||
|
||||
-- Module name.
name: string
|
||||
-- URI associated with the module.
uri: string
|
||||
end
|
||||
|
||||
return Module_Info
|
|
@ -5,7 +5,7 @@ local log = require "logger"
|
|||
local utils = require "utils"
|
||||
local snippets = require "generator.snippets"
|
||||
|
||||
local tmpl = (function(mod)
|
||||
local tmpl = (function(mod: string): string
|
||||
local package_path = utils.do_or_fail(path.package_path, mod)
|
||||
local package_dir = path.dirname(package_path)
|
||||
return utils.do_or_fail(file.read, package_dir .. "/template.tl.tmpl", false)
|
||||
|
@ -13,10 +13,22 @@ end)(...)
|
|||
|
||||
local generator = {}
|
||||
|
||||
function generator.generate_teal(data)
|
||||
local record Generate_Teal_Data_Record
|
||||
section: string
|
||||
items: { snippets.Anonymous_Function_Record }
|
||||
end
|
||||
function generator.generate_teal(data: { Generate_Teal_Data_Record }): string
|
||||
-- TODO : add the required modules to the generated code
|
||||
-- TODO : replace this with a proper way to get the module name (will also probably need the module path)
|
||||
local module_data = { name = "module_name" }
|
||||
local record Module_Data_Record
|
||||
name: string
|
||||
static_functions: { snippets.Anonymous_Function_Record }
|
||||
constructors: { snippets.Anonymous_Function_Record }
|
||||
methods: { string }
|
||||
properties: { snippets.Anonymous_Function_Record }
|
||||
signals: { snippets.Anonymous_Function_Record }
|
||||
end
|
||||
local module_data: Module_Data_Record = { name = "module_name" }
|
||||
for _, item in ipairs(data) do
|
||||
if item.section == "Static functions" then
|
||||
-- TODO
|
||||
|
@ -49,19 +61,19 @@ function generator.generate_teal(data)
|
|||
return utils.do_or_fail(template.substitute, tmpl, env)
|
||||
end
|
||||
|
||||
function generator.write(file_content, file_path)
|
||||
function generator.write(file_content:string, file_path: string)
|
||||
-- Make sure the directory we want to write the file to exists
|
||||
local directory = path.dirname(file_path)
|
||||
if not path.isdir(directory) then
|
||||
path.mkdir(directory)
|
||||
end
|
||||
|
||||
local success, error = file.write(file_path, file_content, false)
|
||||
local success, error_message = file.write(file_path, file_content, false)
|
||||
|
||||
if not success then
|
||||
log:error {
|
||||
"generator.write error",
|
||||
error = error,
|
||||
error = error_message,
|
||||
}
|
||||
return
|
||||
end
|
|
@ -2,10 +2,10 @@ local utils = require "utils"
|
|||
local template = require "pl.template"
|
||||
|
||||
-- Refactor scraper code to use pl.List objects
|
||||
local function join(arr, delim)
|
||||
local function join<T>(arr: { string }, delim: string): string
|
||||
local ret = ""
|
||||
for i, type in ipairs(arr) do
|
||||
ret = ret .. type
|
||||
for i, t in ipairs(arr) do
|
||||
ret = ret .. t
|
||||
if i < #arr then
|
||||
ret = ret .. delim
|
||||
end
|
||||
|
@ -13,13 +13,27 @@ local function join(arr, delim)
|
|||
return ret
|
||||
end
|
||||
|
||||
local snippets = {}
|
||||
--- One parameter of an `Anonymous_Function_Record`.
local record Anonymous_Function_Parameter_Record
|
||||
-- Name of the parameter.
name: string
|
||||
-- List of strings describing the parameter's type(s).
type: { string }
|
||||
end
|
||||
|
||||
function snippets.types_list(types)
|
||||
--- Input of `snippets.anonymous_function`: a function name with its parameters and returns.
local record Anonymous_Function_Record
|
||||
-- Name of the function.
name: string
|
||||
-- Parameters of the function; each entry carries a name and a list of types.
parameters: { Anonymous_Function_Parameter_Record }
|
||||
-- List of strings describing what the function returns.
returns: { string }
|
||||
end
|
||||
|
||||
local snippets = {
|
||||
Anonymous_Function_Parameter_Record = Anonymous_Function_Parameter_Record,
|
||||
Anonymous_Function_Record = Anonymous_Function_Record,
|
||||
}
|
||||
|
||||
function snippets.types_list(types: { string }): string
|
||||
return join(types, ", ")
|
||||
end
|
||||
|
||||
function snippets.anonymous_function(item)
|
||||
function snippets.anonymous_function(item: Anonymous_Function_Record): string
|
||||
local parameters_string = ""
|
||||
if item.parameters then
|
||||
for i, param in ipairs(item.parameters) do
|
|
@ -1,18 +0,0 @@
|
|||
local ansicolors = require "ansicolors"
|
||||
local console = require "logging.console"
|
||||
local ll = require "logging"
|
||||
|
||||
local log = console {
|
||||
logLevel = ll.DEBUG,
|
||||
destination = "stdout",
|
||||
timestampPattern = "[%y-%m-%d %H:%M:%S]",
|
||||
logPatterns = {
|
||||
[ll.DEBUG] = ansicolors "%date%{cyan} %level %message %{reset}(%source)\n",
|
||||
[ll.INFO] = ansicolors "%date %level %message\n",
|
||||
[ll.WARN] = ansicolors "%date%{yellow} %level %message\n",
|
||||
[ll.ERROR] = ansicolors "%date%{red bright} %level %message %{reset}(%source)\n",
|
||||
[ll.FATAL] = ansicolors "%date%{magenta bright} %level %message %{reset}(%source)\n",
|
||||
},
|
||||
}
|
||||
|
||||
return log
|
|
@ -0,0 +1,17 @@
|
|||
local ansicolors = require "ansicolors"
|
||||
local logging_console = require "logging.console"
|
||||
|
||||
--- Shared console logger returned by this module.
-- Built by calling `logging.console` with a single options table.
local log = logging_console {
|
||||
-- Level passed as the string "DEBUG".
logLevel = "DEBUG",
|
||||
-- Output goes to stdout.
destination = "stdout",
|
||||
-- Pattern used for the %date placeholder of the patterns below.
timestampPattern = "[%y-%m-%d %H:%M:%S]",
|
||||
-- One message pattern per level name; each pattern string is passed through `ansicolors`.
logPatterns = {
|
||||
DEBUG = ansicolors "%date%{cyan} %level %message %{reset}(%source)\n",
|
||||
INFO = ansicolors "%date %level %message\n",
|
||||
WARN = ansicolors "%date%{yellow} %level %message\n",
|
||||
ERROR = ansicolors "%date%{red bright} %level %message %{reset}(%source)\n",
|
||||
FATAL = ansicolors "%date%{magenta bright} %level %message %{reset}(%source)\n",
|
||||
},
|
||||
}
|
||||
|
||||
return log
|
|
@ -1,66 +0,0 @@
|
|||
local properties = {}
|
||||
|
||||
-- properties.base_url = "https://awesomewm.org/apidoc"
|
||||
properties.base_url = "file:///usr/share/doc/awesome/doc"
|
||||
|
||||
properties.index_uri = "/index.html"
|
||||
|
||||
properties.out_directory = "generated"
|
||||
|
||||
--- Pages from the navigation menu to ignore.
|
||||
-- Set to ignore documentation pages and sample files. I also added libraries with
|
||||
-- low-quality API documentation; I'll probably work on them later. Let's start
|
||||
-- with what works best first.
|
||||
properties.ignored_modules = {
|
||||
-- Sample files
|
||||
"rc.lua",
|
||||
"theme.lua",
|
||||
|
||||
-- Utility libraries
|
||||
"gears.debug",
|
||||
"gears.filesystem",
|
||||
"gears.geometry",
|
||||
"gears.math",
|
||||
"gears.object",
|
||||
"gears.protected_call",
|
||||
"gears.sort",
|
||||
"gears.string",
|
||||
"gears.table",
|
||||
"gears.wallpaper",
|
||||
|
||||
-- Theme related libraries
|
||||
"beautiful",
|
||||
"gears.color",
|
||||
"gears.shape",
|
||||
|
||||
-- Classes
|
||||
"awful.widget.common",
|
||||
"gears.cache",
|
||||
"gears.matrix",
|
||||
"menubar.icon_theme",
|
||||
"menubar.index_theme",
|
||||
"signals",
|
||||
"wibox.drawable",
|
||||
"wibox.hierarchy",
|
||||
"wibox.widget.base",
|
||||
"xproperties",
|
||||
|
||||
-- Documentation
|
||||
"Authors",
|
||||
"Readme",
|
||||
"Contributing",
|
||||
"The Widget system",
|
||||
"Creating new widget",
|
||||
"Default configuration file documentation",
|
||||
"Change Awesome appearance",
|
||||
"My first Awesome",
|
||||
"The AwesomeWM client layout system",
|
||||
"Startup options",
|
||||
"Building and Testing",
|
||||
"Using Cairo and LGI",
|
||||
"Tips for upgrading your configuration",
|
||||
"NEWS",
|
||||
"FAQ",
|
||||
}
|
||||
|
||||
return properties
|
|
@ -0,0 +1,72 @@
|
|||
--- Shape of the configuration table returned by this module.
local record Properties
|
||||
-- Base URL of the documentation.
base_url: string
|
||||
-- URI of the index page.
index_uri: string
|
||||
|
||||
-- Name of the output directory.
out_directory: string
|
||||
|
||||
--- Pages from the navigation menu to ignore.
|
||||
-- Set to ignore documentation pages and sample files. I also added libraries with
|
||||
-- low-quality API documentation; I'll probably work on them later. Let's start
|
||||
-- with what works best first.
|
||||
ignored_modules: { string }
|
||||
end
|
||||
|
||||
-- Navigation entries to skip, grouped by the reason they are skipped.
local ignored_modules: { string } = {
    -- Sample files
    "rc.lua",
    "theme.lua",

    -- Utility libraries
    "gears.debug",
    "gears.filesystem",
    "gears.geometry",
    "gears.math",
    "gears.object",
    "gears.protected_call",
    "gears.sort",
    "gears.string",
    "gears.table",
    "gears.wallpaper",

    -- Theme related libraries
    "beautiful",
    "gears.color",
    "gears.shape",

    -- Classes
    "awful.widget.common",
    "gears.cache",
    "gears.matrix",
    "menubar.icon_theme",
    "menubar.index_theme",
    "signals",
    "wibox.drawable",
    "wibox.hierarchy",
    "wibox.widget.base",
    "xproperties",

    -- Documentation
    "Authors",
    "Readme",
    "Contributing",
    "The Widget system",
    "Creating new widget",
    "Default configuration file documentation",
    "Change Awesome appearance",
    "My first Awesome",
    "The AwesomeWM client layout system",
    "Startup options",
    "Building and Testing",
    "Using Cairo and LGI",
    "Tips for upgrading your configuration",
    "NEWS",
    "FAQ",
}

-- Configuration values returned by this module.
local properties: Properties = {
    -- base_url = "https://awesomewm.org/apidoc",
    base_url = "file:///usr/share/doc/awesome/doc",
    index_uri = "/index.html",
    out_directory = "generated",
    ignored_modules = ignored_modules,
}
|
||||
|
||||
return properties
|
|
@ -1,13 +1,15 @@
|
|||
local Function_Info = require "entities.Function_Info"
|
||||
local List = require "pl.List"
|
||||
local Module_Doc = require "entities.Module_Doc"
|
||||
local scan = require "web_sanitize.query.scan_html"
|
||||
local scraper_utils = require "scraper.utils"
|
||||
local utils = require "utils"
|
||||
|
||||
local function extract_function_name(function_name_node)
|
||||
return function_name_node and (function_name_node.attr.name:gsub(".*:", ""))
|
||||
local function extract_function_name(function_name_node: scan.HTMLNode): string
|
||||
return function_name_node and ((function_name_node.attr.name as string):gsub(".*:", ""))
|
||||
end
|
||||
|
||||
local function extract_function_return_types(function_return_types_node)
|
||||
local function extract_function_return_types(function_return_types_node: scan.HTMLNode): { string }
|
||||
if not function_return_types_node then
|
||||
return {}
|
||||
end
|
||||
|
@ -15,12 +17,12 @@ local function extract_function_return_types(function_return_types_node)
|
|||
local selector = "span.types .type"
|
||||
local html = function_return_types_node:outer_html()
|
||||
|
||||
return scraper_utils.scrape(html, selector, function(node)
|
||||
return scraper_utils.scrape(html, selector, function(node: scan.HTMLNode): string
|
||||
return utils.sanitize_string(node:inner_text())
|
||||
end)
|
||||
end
|
||||
|
||||
local function extract_section_functions(dl)
|
||||
local function extract_section_functions(dl: string): { Function_Info }
|
||||
local query_selectors = {
|
||||
function_name = "dt a",
|
||||
function_return_type = "dd ol",
|
||||
|
@ -29,14 +31,16 @@ local function extract_section_functions(dl)
|
|||
return scraper_utils.scrape_tuples(
|
||||
dl,
|
||||
{ query_selectors.function_name, query_selectors.function_return_type },
|
||||
function(nodes)
|
||||
function(nodes: { string : scan.HTMLNode | nil }): Function_Info
|
||||
local function_info = Function_Info()
|
||||
|
||||
function_info.name =
|
||||
extract_function_name(nodes[query_selectors.function_name])
|
||||
function_info.return_types = extract_function_return_types(
|
||||
function_info.return_types = List(
|
||||
extract_function_return_types(
|
||||
nodes[query_selectors.function_return_type]
|
||||
)
|
||||
)
|
||||
|
||||
return function_info
|
||||
end
|
||||
|
@ -45,7 +49,7 @@ end
|
|||
|
||||
local module = {}
|
||||
|
||||
function module.get_doc_from_page(html)
|
||||
function module.get_doc_from_page(html: string): Module_Doc
|
||||
local nodes = scraper_utils.extract_nodes(html, {
|
||||
"h2.section-header",
|
||||
"dl.function",
|
||||
|
@ -57,20 +61,21 @@ function module.get_doc_from_page(html)
|
|||
|
||||
local module_doc = Module_Doc()
|
||||
|
||||
for i, h2 in ipairs(nodes:get "h2.section-header") do
|
||||
for i = 1, #nodes:get("h2.section-header") do
|
||||
local h2 = nodes:get("h2.section-header")[i]
|
||||
local section_name = utils.sanitize_string(h2:inner_text())
|
||||
local dl_html = nodes:get("dl.function")[i]:outer_html()
|
||||
|
||||
if section_name == "Constructors" then
|
||||
module_doc.constructors = extract_section_functions(dl_html)
|
||||
module_doc.constructors = List(extract_section_functions(dl_html))
|
||||
elseif section_name == "Static module functions" then
|
||||
module_doc.static_functions = extract_section_functions(dl_html)
|
||||
module_doc.static_functions = List(extract_section_functions(dl_html))
|
||||
elseif section_name == "Object properties" then
|
||||
print "Not implemented: Deprecated object properties"
|
||||
elseif section_name == "Deprecated object properties" then
|
||||
print "Not implemented: Deprecated object properties"
|
||||
elseif section_name == "Object methods" then
|
||||
module_doc.methods = extract_section_functions(dl_html)
|
||||
module_doc.methods = List(extract_section_functions(dl_html))
|
||||
elseif section_name == "Signals" then
|
||||
print "Not implemented: Signals"
|
||||
else
|
|
@ -1,4 +1,5 @@
|
|||
local Module_Info = require "entities.Module_Info"
|
||||
local scan = require "web_sanitize.query.scan_html"
|
||||
local scraper_utils = require "scraper.utils"
|
||||
local utils = require "utils"
|
||||
|
||||
|
@ -6,9 +7,9 @@ local module = {}
|
|||
|
||||
local MODULE_A_TAG_QUERY_SELECTOR = "div#navigation ul li a"
|
||||
|
||||
local function extract_module_info(node)
|
||||
local function extract_module_info(node: scan.HTMLNode): Module_Info
|
||||
local name = utils.sanitize_string(node:inner_text())
|
||||
local uri = node.attr.href
|
||||
local uri = node.attr.href as string
|
||||
|
||||
if not (name and uri) then
|
||||
error("Can't extract module info from node: " .. node:outer_html())
|
||||
|
@ -17,7 +18,7 @@ local function extract_module_info(node)
|
|||
return Module_Info(name, uri)
|
||||
end
|
||||
|
||||
function module.get_modules_from_index(html)
|
||||
function module.get_modules_from_index(html: string): { Module_Info }
|
||||
return scraper_utils.scrape(
|
||||
html,
|
||||
MODULE_A_TAG_QUERY_SELECTOR,
|
|
@ -1,68 +0,0 @@
|
|||
local List = require "pl.List"
|
||||
local log = require "logger"
|
||||
local Map = require "pl.Map"
|
||||
local scanner = require "web_sanitize.query.scan_html"
|
||||
local tablex = require "pl.tablex"
|
||||
|
||||
local scraper_utils = {}
|
||||
|
||||
function scraper_utils.scrape(html, query_selector, extract_callback)
|
||||
local ret = {}
|
||||
|
||||
scanner.scan_html(html, function(stack)
|
||||
if stack:is(query_selector) then
|
||||
local node = stack:current()
|
||||
local success, info = pcall(extract_callback, node)
|
||||
|
||||
if not success then
|
||||
log:error { message = info }
|
||||
else
|
||||
table.insert(ret, info)
|
||||
end
|
||||
end
|
||||
end)
|
||||
|
||||
return ret
|
||||
end
|
||||
|
||||
function scraper_utils.extract_nodes(html, query_selectors)
|
||||
local siblings = Map()
|
||||
|
||||
tablex.foreach(query_selectors, function(query_selector)
|
||||
siblings:set(query_selector, List())
|
||||
end)
|
||||
|
||||
scanner.scan_html(html, function(stack)
|
||||
tablex.foreach(query_selectors, function(query_selector)
|
||||
if stack:is(query_selector) then
|
||||
siblings:get(query_selector):append(stack:current())
|
||||
end
|
||||
end)
|
||||
end)
|
||||
|
||||
return siblings
|
||||
end
|
||||
|
||||
function scraper_utils.scrape_tuples(html, query_selectors, extract_callback)
|
||||
local nodes = scraper_utils.extract_nodes(html, query_selectors)
|
||||
|
||||
local ret = {}
|
||||
|
||||
for i = 1, #nodes:get(query_selectors[1]) do
|
||||
local node_list = {}
|
||||
tablex.foreach(query_selectors, function(query_selector)
|
||||
node_list[query_selector] = nodes:get(query_selector)[i] or nil
|
||||
end)
|
||||
local success, info = pcall(extract_callback, node_list)
|
||||
|
||||
if not success then
|
||||
log:error { message = info }
|
||||
else
|
||||
table.insert(ret, info)
|
||||
end
|
||||
end
|
||||
|
||||
return ret
|
||||
end
|
||||
|
||||
return scraper_utils
|
|
@ -0,0 +1,73 @@
|
|||
local List = require "pl.List"
|
||||
local log = require "logger"
|
||||
local Map = require "pl.Map"
|
||||
local scan = require "web_sanitize.query.scan_html"
|
||||
local scanner = require "web_sanitize.query.scan_html"
|
||||
local tablex = require "pl.tablex"
|
||||
|
||||
local scraper_utils = {}
|
||||
|
||||
--- Collect one extracted value per node matching `query_selector`.
-- Scans `html` with `scanner.scan_html` and runs `extract_callback` under
-- `pcall` on every node for which `stack:is(query_selector)` holds. A callback
-- that raises is reported through `log:error` and adds nothing to the result.
-- @param html HTML text to scan
-- @param query_selector selector tested against the node stack
-- @param extract_callback called with the current node of a matching stack
-- @return list of the values returned by the successful callback calls
function scraper_utils.scrape<T>(html: string, query_selector: string, extract_callback: function(node: scan.HTMLNode): T): { T }
    local extracted: { T } = {}

    local function visit(stack: scan.NodeStack)
        if not stack:is(query_selector) then
            return
        end

        local ok, outcome = pcall(extract_callback, stack:current())
        if ok then
            table.insert(extracted, outcome as T)
        else
            log:error { message = outcome as string }
        end
    end

    scanner.scan_html(html, visit)

    return extracted
end
|
||||
|
||||
--- Group the nodes of `html` by the selectors they match.
-- Every selector in `query_selectors` gets an entry in the returned map, even
-- when nothing matches it; a node matching several selectors is appended to
-- each of the corresponding lists.
-- @param html HTML text to scan
-- @param query_selectors selectors tested against the node stack
-- @return map from selector to the list of nodes that matched it
function scraper_utils.extract_nodes(html: string, query_selectors: { string }): Map<string, List<scan.HTMLNode>>
    local nodes_by_selector: Map<string, List<scan.HTMLNode>> = Map()

    -- Start every selector with an empty list so lookups never miss.
    local function register(query_selector: string)
        nodes_by_selector:set(query_selector, List())
    end
    tablex.foreach(query_selectors, register)

    scanner.scan_html(html, function(stack: scan.NodeStack)
        local function collect(query_selector: string)
            if not stack:is(query_selector) then
                return
            end

            local matches = nodes_by_selector:get(query_selector)
            matches:append(stack:current())
        end

        tablex.foreach(query_selectors, collect)
    end)

    return nodes_by_selector
end
|
||||
|
||||
--- Extract one value per "tuple" of nodes matched by several selectors.
-- Nodes are first grouped per selector with `scraper_utils.extract_nodes`. The
-- i-th tuple maps each selector to the i-th node matched by that selector, or
-- to nil when that selector matched fewer nodes. The number of tuples is the
-- number of nodes matched by the first selector. `extract_callback` runs under
-- `pcall`; a callback that raises is reported through `log:error` and adds
-- nothing to the result.
-- @param html HTML text to scan
-- @param query_selectors selectors whose matches are paired by position
-- @param extract_callback called with a selector -> node table for each tuple
-- @return list of the values returned by the successful callback calls;
--         empty when `query_selectors` is empty
function scraper_utils.scrape_tuples<T>(html: string, query_selectors: { string }, extract_callback: function(tuple: { string : scan.HTMLNode | nil }): T): { T }
    local ret: { T } = {}

    -- Without any selector there is no reference list to count tuples from;
    -- looking up `query_selectors[1]` would otherwise take the length of nil.
    if #query_selectors == 0 then
        return ret
    end

    local nodes = scraper_utils.extract_nodes(html, query_selectors)

    for i = 1, #nodes:get(query_selectors[1]) do
        local node_list: { string : scan.HTMLNode | nil } = {}
        tablex.foreach(query_selectors, function(query_selector: string)
            -- A missing i-th match simply leaves the entry nil.
            node_list[query_selector] = nodes:get(query_selector)[i]
        end)
        local success, info_or_error = pcall(extract_callback, node_list)

        if not success then
            local error_message = info_or_error as string
            log:error { message = error_message }
        else
            local info = info_or_error as T
            table.insert(ret, info)
        end
    end

    return ret
end
|
||||
|
||||
return scraper_utils
|
|
@ -1,70 +0,0 @@
|
|||
local web_sanitize = require "web_sanitize"
|
||||
|
||||
local utils = {}
|
||||
|
||||
function utils.has_item(table, item)
|
||||
for k, v in pairs(table) do
|
||||
if v == item then
|
||||
return k
|
||||
end
|
||||
end
|
||||
|
||||
return nil
|
||||
end
|
||||
|
||||
function utils.filter(list, predicate)
|
||||
local filtered = {}
|
||||
|
||||
for position, value in ipairs(list) do
|
||||
if predicate(value, position) then
|
||||
table.insert(filtered, value)
|
||||
end
|
||||
end
|
||||
|
||||
return filtered
|
||||
end
|
||||
|
||||
--- Build a new list from the results of calling `iteratee` on each element.
-- @param list sequence traversed with `ipairs`
-- @param iteratee called as `iteratee(value, position)`; only its first
--        return value is kept
-- @return new list of the collected results
function utils.map(list, iteratee)
    local mapped = {}

    for position, value in ipairs(list) do
        -- Bind the result to a local first: passing the call directly as the
        -- last argument of table.insert would forward every return value, so
        -- a callback returning two values would be read as (position, value).
        local result = iteratee(value, position)
        table.insert(mapped, result)
    end

    return mapped
end
|
||||
|
||||
--- Extract the text of an HTML fragment and strip surrounding whitespace.
-- The text comes from `web_sanitize.extract_text`; `utils.trim` does the
-- stripping. The former intermediate `utils.replace(..., "^%s*(.-)%s*$", "%1")`
-- call is dropped: `utils.replace` escapes its search string, so that pattern
-- was only ever matched as literal text and never trimmed anything.
-- @param string HTML fragment
-- @return extracted text without leading or trailing whitespace
function utils.sanitize_string(string)
    return utils.trim(web_sanitize.extract_text(string))
end
|
||||
|
||||
-- Extracted from the Penlight Lua library.
|
||||
-- Sometimes Lua's string.gsub can't match unescaped strings.
|
||||
-- https://stackoverflow.com/a/72666170
|
||||
function utils.escape(string)
|
||||
return (string:gsub("[%-%.%+%[%]%(%)%$%^%%%?%*]", "%%%1"))
|
||||
end
|
||||
|
||||
function utils.replace(string, old, new, n)
|
||||
return (string:gsub(utils.escape(old), new:gsub("%%", "%%%%"), n))
|
||||
end
|
||||
|
||||
function utils.trim(string)
|
||||
return string:match "^%s*(.-)%s*$"
|
||||
end
|
||||
|
||||
function utils.do_or_fail(func, ...)
|
||||
local log = require "logger"
|
||||
local res, err = func(...)
|
||||
|
||||
if not res then
|
||||
log:error { "do_or_fail failed!", error = err }
|
||||
error(err)
|
||||
end
|
||||
|
||||
return res
|
||||
end
|
||||
|
||||
return utils
|
|
@ -0,0 +1,72 @@
|
|||
local web_sanitize = require "web_sanitize"
|
||||
|
||||
local utils = {}
|
||||
|
||||
--- Look up the key under which `item` is stored in `t`.
-- Values are compared with `==` while traversing `t` with `pairs`, so when
-- several entries hold `item`, which key is returned is not specified.
-- @param t table to search
-- @param item value to look for
-- @return the key of a matching entry, or nil when no value equals `item`
function utils.has_item(t: table, item: any): any
    local found: any = nil

    for key, value in pairs(t) do
        if value == item then
            found = key
            break
        end
    end

    return found
end
|
||||
|
||||
--- Keep the elements of `list` accepted by `predicate`.
-- @param list sequence traversed with `ipairs`
-- @param predicate called as `predicate(value, position)`; a truthy result
--        keeps the element
-- @return new list holding the kept elements in their original order
function utils.filter<T>(list: { T }, predicate: function(value: T, position: integer): boolean): { T }
    local kept: { T } = {}

    for index, element in ipairs(list) do
        local keep = predicate(element, index)
        if keep then
            kept[#kept + 1] = element
        end
    end

    return kept
end
|
||||
|
||||
--- Build a new list from the results of calling `iteratee` on each element.
-- @param list sequence traversed with `ipairs`
-- @param iteratee called as `iteratee(value, position)`; only its first
--        return value is kept
-- @return new list of the collected results
function utils.map<T, U>(list: { T }, iteratee: function(value: T, position: integer): U): { U }
    local mapped: { U } = {}

    for position, value in ipairs(list) do
        -- Bind the result to a local first: passing the call directly as the
        -- last argument of table.insert would forward every return value, so
        -- a callback returning two values would be read as (position, value).
        local result = iteratee(value, position)
        table.insert(mapped, result)
    end

    return mapped
end
|
||||
|
||||
-- Extracted from the Penlight Lua library.
|
||||
-- Sometimes Lua's string.gsub can't match unescaped strings.
|
||||
-- https://stackoverflow.com/a/72666170
|
||||
--- Escape the Lua pattern magic characters in `s`.
-- Each of `- . + [ ] ( ) $ ^ % ? *` is prefixed with `%`, so the result can be
-- used as a pattern that matches `s` literally.
-- @param s text to escape
-- @return the escaped text (the substitution count from gsub is discarded)
function utils.escape(s: string): string
    local escaped = s:gsub("[%-%.%+%[%]%(%)%$%^%%%?%*]", "%%%1")
    return escaped
end
|
||||
|
||||
--- Replace literal occurrences of `old` in `s` with the literal text `new`.
-- `old` is escaped with `utils.escape` and every `%` in `new` is doubled, so
-- neither argument is interpreted as a pattern or as a capture reference.
-- @param s text to search
-- @param old literal text to look for
-- @param new literal replacement text
-- @param n passed to gsub as the maximum number of substitutions
-- @return the resulting text (the substitution count from gsub is discarded)
function utils.replace(s: string, old: string, new: string, n: number): string
    local pattern = utils.escape(old)
    local replacement = new:gsub("%%", "%%%%")
    local replaced = s:gsub(pattern, replacement, n)
    return replaced
end
|
||||
|
||||
--- Strip leading and trailing whitespace from `s`.
-- @param s text to trim
-- @return the text captured between the leading and trailing whitespace runs
function utils.trim(s: string): string
    local trimmed = s:match("^%s*(.-)%s*$")
    return trimmed
end
|
||||
|
||||
--- Extract the text of an HTML fragment and strip surrounding whitespace.
-- The text comes from `web_sanitize.extract_text`; `utils.trim` does the
-- stripping. The former intermediate `utils.replace(..., "^%s*(.-)%s*$", "%1")`
-- call is dropped: `utils.replace` escapes its search string, so that pattern
-- was only ever matched as literal text and never trimmed anything.
-- @param s HTML fragment
-- @return extracted text without leading or trailing whitespace
function utils.sanitize_string(s: string): string
    return utils.trim(web_sanitize.extract_text(s))
end
|
||||
|
||||
-- At some point, we should probably write a wrapper to make Penlight's functions work with pcall.
|
||||
-- Shape of the functions accepted by `utils.do_or_fail`: they return either a
-- value, or nil followed by an error message.
local type Func = function<T>(...: any): T | nil, string

--- Call `func` with the given arguments and raise if it reports a failure.
-- A falsy first result counts as a failure: the second result is logged
-- through the "logger" module and then raised with `error`.
-- @param func function following the `value | nil, message` convention
-- @param ... arguments forwarded to `func`
-- @return the first result of `func` when it is truthy
function utils.do_or_fail<T>(func: Func<T>, ...: any): T
    local logger = require "logger"
    local result, failure = func(...)

    local failed = not result
    if failed then
        logger:error { "do_or_fail failed!", error = failure }
        error(failure)
    end

    return result
end
|
||||
|
||||
return utils
|
Loading…
Reference in New Issue