LDoc2tl/ldoc/lexer.lua

--- Lexical scanner for creating a sequence of tokens from text. <br>
-- <p><code>lexer.scan(s)</code> returns an iterator over all tokens found in the
-- string <code>s</code>. This iterator returns two values, a token type string
-- (such as 'string' for quoted string, 'iden' for identifier) and the value of the
-- token.
-- <p>
-- Versions specialized for Lua and C are available; these also handle block comments
-- and classify keywords as 'keyword' tokens. For example:
-- <pre class=example>
-- > s = 'for i=1,n do'
-- > for t,v in lexer.lua(s)  do print(t,v) end
-- keyword for
-- iden    i
-- =       =
-- number  1
-- ,       ,
-- iden    n
-- keyword do
-- </pre>
-- See the Guide for further <a href="../../index.html#lexer">discussion</a> <br>
-- @class module
-- @name pl.lexer

local strfind = string.find
local strsub = string.sub
local append = table.insert
--[[
module ('pl.lexer',utils._module)
]]

local function assert_arg(idx,val,tp)
    if type(val) ~= tp then
        error("argument "..idx.." must be "..tp, 2)
    end
end

local lexer = {}

local NUMBER1 = '^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER2 = '^[%+%-]?%d+%.?%d*'
local NUMBER3 = '^0x[%da-fA-F]+'
local NUMBER4 = '^%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER5 = '^%d+%.?%d*'
local IDEN = '^[%a_][%w_]*'
local WSPACE = '^%s+'
local STRING1 = [[^'.-[^\\]']]
local STRING2 = [[^".-[^\\]"]]
local STRING3 = "^((['\"])%2)" -- empty string
local PREPRO = '^#.-[^\\]\n'

local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword

local function tdump(tok)
    return tok,tok
end

local function ndump(tok,options)
    if options and options.number then
        tok = tonumber(tok)
    end
    return "number",tok
end

-- regular strings, single or double quotes; usually we want them
-- without the quotes
local function sdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return "string",tok
end

-- long Lua strings need extra work to get rid of the quotes
local function sdump_l(tok,options)
    if options and options.string then
        tok = tok:sub(3,-3)
    end
    return "string",tok
end

local function chdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return "char",tok
end

local function cdump(tok)
    return 'comment',tok
end

local function wsdump (tok)
    return "space",tok
end

local function pdump (tok)
    return 'prepro',tok
end

local function plain_vdump(tok)
    return "iden",tok
end

local function lua_vdump(tok)
    if lua_keyword[tok] then
        return "keyword",tok
    else
        return "iden",tok
    end
end

local function cpp_vdump(tok)
    if cpp_keyword[tok] then
        return "keyword",tok
    else
        return "iden",tok
    end
end

local function count_lines(line, text)
   local index, limit = 1, #text
   while index <= limit do
      local start, stop = text:find('\r\n', index, true)
      if not start then
         start, stop = text:find('[\r\n\f]', index)
         if not start then break end
      end
      index = stop + 1
      line = line + 1
   end
   return line
end

local multiline = { comment = true, space = true }


--- create a plain token iterator from a string or file-like object.
-- @param s the string
-- @param matches an optional match table (set of pattern-action pairs)
-- @param filter a table of token types to exclude, by default {space=true}
-- @param options a table of options; by default, {number=true,string=true},
-- which means convert numbers and strip string quotes.
function lexer.scan (s,matches,filter,options)
    --assert_arg(1,s,'string')
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    if filter then
        if filter.space then filter[wsdump] = true end
        if filter.comments then
            filter[cdump] = true
        end
    end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING3,sdump},
                {STRING1,sdump},
                {STRING2,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end
    local i1,i2,idx,res1,res2,tok,pat,fun,capt
    local line = 1
    if file then
        s = file:read()
        if not s then return nil end -- empty file
        s = s ..'\n'
     end
    local sz = #s
    local idx = 1
    if sz == 0 then return nil end -- empty file

    local res = {}
    local mt = {}
    mt.__index = mt
    setmetatable(res,mt)

    function mt.lineno() return line end

    function mt.getline()
        if idx < sz then
            tok = strsub(s,idx,-2)
            idx = sz + 1
            line = line + 1
            return tok
        else
            idx = sz + 1
            line = line + 1
            return file:read()
        end
    end

    function mt.next (tok)
        local t,v = tok()
        while t == 'space' do
            t,v = tok()
        end
        return t,v
    end

    function mt.__call ()
      while true do
        for _,m in ipairs(matches) do
            pat,fun = m[1],m[2]
            if fun == nil then error("no match for "..pat) end
            i1,i2 = strfind(s,pat,idx)
            if i1 then
                tok = strsub(s,i1,i2)
                idx = i2 + 1
                if not (filter and filter[fun]) then
                    lexer.finished = idx > sz
                    local t,v = fun(tok,options)
                    if not file and multiline[t] then
                        line = count_lines(line,v)
                    end
                    return t,v
                end
            end
        end
        if idx > sz then
            if file then
                line = line + 1
                s = file:read()
                if not s then return end
                s = s .. '\n'
                idx ,sz = 1,#s
            else
                return
            end
        end
      end
    end
    return res
end

--- get everything in a stream upto a newline.
-- @param tok a token stream
-- @return a string
function lexer.getline (tok)
    return tok:getline()
end

--- get current line number. <br>
-- Only available if the input source is a file-like object.
-- @param tok a token stream
-- @return the line number and current column
function lexer.lineno (tok)
    return tok:lineno()
end

--- get the Lua keywords as a set-like table.
-- So <code>res["and"]</code> etc would be <code>true</code>.
-- @return a table
function lexer.get_keywords ()
    if not lua_keyword then
        lua_keyword = {
            ["and"] = true, ["break"] = true,  ["do"] = true,
            ["else"] = true, ["elseif"] = true, ["end"] = true,
            ["false"] = true, ["for"] = true, ["function"] = true,
            ["if"] = true, ["in"] = true,  ["local"] = true, ["nil"] = true,
            ["not"] = true, ["or"] = true, ["repeat"] = true,
            ["return"] = true, ["then"] = true, ["true"] = true,
            ["until"] = true,  ["while"] = true
        }
    end
    return lua_keyword
end


--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value.
-- @param s the string
-- @param filter a table of token types to exclude, by default {space=true,comments=true}
-- @param options a table of options; by default, {number=true,string=true},
-- which means convert numbers and strip string quotes.
function lexer.lua(s,filter,options)
    filter = filter or {space=true,comments=true}
    lexer.get_keywords()
    if not lua_matches then
        lua_matches = {
            {WSPACE,wsdump},
            {NUMBER3,ndump},
            {IDEN,lua_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            {STRING1,sdump},
            {STRING2,sdump},
            {'^%-%-%[(=*)%[.-%]%1%]',cdump},
            {'^%-%-.-\n',cdump},
            {'^%[(=*)%[.-%]%1%]',sdump_l},
            {'^==',tdump},
            {'^~=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^%.%.%.',tdump},
            {'^%.%.',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,lua_matches,filter,options)
end

--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type type and value.
-- @param s the string
-- @param filter a table of token types to exclude, by default {space=true,comments=true}
-- @param options a table of options; by default, {number=true,string=true},
-- which means convert numbers and strip string quotes.
function lexer.cpp(s,filter,options)
    filter = filter or {comments=true}
    if not cpp_keyword then
        cpp_keyword = {
            ["class"] = true, ["break"] = true,  ["do"] = true, ["sizeof"] = true,
            ["else"] = true, ["continue"] = true, ["struct"] = true,
            ["false"] = true, ["for"] = true, ["public"] = true, ["void"] = true,
            ["private"] = true, ["protected"] = true, ["goto"] = true,
            ["if"] = true, ["static"] = true,  ["const"] = true, ["typedef"] = true,
            ["enum"] = true, ["char"] = true, ["int"] = true, ["bool"] = true,
            ["long"] = true, ["float"] = true, ["true"] = true, ["delete"] = true,
            ["double"] = true,  ["while"] = true, ["new"] = true,
            ["namespace"] = true, ["try"] = true, ["catch"] = true,
            ["switch"] = true, ["case"] = true, ["extern"] = true,
            ["return"] = true,["default"] = true,['unsigned']  = true,['signed'] = true,
            ["union"] =  true, ["volatile"] = true, ["register"] = true,["short"] = true,
        }
    end
    if not cpp_matches then
        cpp_matches = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            {STRING1,chdump},
            {STRING2,sdump},
            {'^//.-\n',cdump},
            {'^/%*.-%*/',cdump},
            {'^==',tdump},
            {'^!=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^->',tdump},
            {'^&&',tdump},
            {'^||',tdump},
            {'^%+%+',tdump},
            {'^%-%-',tdump},
            {'^%+=',tdump},
            {'^%-=',tdump},
            {'^%*=',tdump},
            {'^/=',tdump},
            {'^|=',tdump},
            {'^%^=',tdump},
            {'^::',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,cpp_matches,filter,options)
end

--- get a list of parameters separated by a delimiter from a stream.
-- @param tok the token stream
-- @param endtoken end of list (default ')'). Can be '\n'
-- @param delim separator (default ',')
-- @return a list of token lists.
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local parm_values = {}
    local level = 1 -- used to count ( and )
    local tl = {}
    local function tappend (tl,t,val)
        val = val or t
        append(tl,{t,val})
    end
    local is_end
    if endtoken == '\n' then
        is_end = function(t,val)
            return t == 'space' and val:find '\n'
        end
    else
        is_end = function (t)
            return t == endtoken
        end
    end
    local is_delim
    if type(delim) == 'function' then
        is_delim = delim
    else
        is_delim = function(t)
            return t == delim
        end
    end
    local token,value
    while true do
        token,value=tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        if is_end(token,value) and level == 1 then
            append(parm_values,tl)
            break
        elseif token == '(' then
            level = level + 1
            tappend(tl,'(')
        elseif token == ')' then
            level = level - 1
            if level == 0 then -- finished with parm list
                append(parm_values,tl)
                break
            else
                tappend(tl,')')
            end
        elseif level == 1 and is_delim(token) then
            append(parm_values,tl) -- a new parm
            tl = {}
        else
            tappend(tl,token,value)
        end
    end
    return parm_values,{token,value}
end

--- get the next non-space token from the stream.
-- @param tok the token stream.
function lexer.skipws (tok)
    return tok:next()
end

local skipws = lexer.skipws

--- get the next token, which must be of the expected type.
-- Throws an error if this type does not match!
-- @param tok the token stream
-- @param expected_type the token type
-- @param no_skip_ws whether we should skip whitespace
function lexer.expecting (tok,expected_type,no_skip_ws)
    assert_arg(1,tok,'function')
    assert_arg(2,expected_type,'string')
    local t,v
    if no_skip_ws then
        t,v = tok()
    else
        t,v = skipws(tok)
    end
    if t ~= expected_type then error ("expecting "..expected_type,2) end
    return v
end

return lexer
initial commit 2011-04-12 19:07:47 +02:00			`--- Lexical scanner for creating a sequence of tokens from text. <br>`
			`-- <p><code>lexer.scan(s)</code> returns an iterator over all tokens found in the`
			`-- string <code>s</code>. This iterator returns two values, a token type string`
			`-- (such as 'string' for quoted string, 'iden' for identifier) and the value of the`
			`-- token.`
			`-- <p>`
			`-- Versions specialized for Lua and C are available; these also handle block comments`
			`-- and classify keywords as 'keyword' tokens. For example:`
			`-- <pre class=example>`
			`-- > s = 'for i=1,n do'`
			`-- > for t,v in lexer.lua(s) do print(t,v) end`
			`-- keyword for`
			`-- iden i`
			`-- = =`
			`-- number 1`
			`-- , ,`
			`-- iden n`
			`-- keyword do`
			`-- </pre>`
			`-- See the Guide for further <a href="../../index.html#lexer">discussion</a> <br>`
			`-- @class module`
			`-- @name pl.lexer`

			`local strfind = string.find`
			`local strsub = string.sub`
			`local append = table.insert`
			`--[[`
			`module ('pl.lexer',utils._module)`
			`]]`

			`local function assert_arg(idx,val,tp)`
			`if type(val) ~= tp then`
			`error("argument "..idx.." must be "..tp, 2)`
			`end`
			`end`

			`local lexer = {}`

			`local NUMBER1 = '^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+'`
			`local NUMBER2 = '^[%+%-]?%d+%.?%d*'`
			`local NUMBER3 = '^0x[%da-fA-F]+'`
			`local NUMBER4 = '^%d+%.?%d*[eE][%+%-]?%d+'`
			`local NUMBER5 = '^%d+%.?%d*'`
			`local IDEN = '^[%a_][%w_]*'`
			`local WSPACE = '^%s+'`
handling of strings was borked 2011-07-08 16:00:21 +02:00			`local STRING1 = [[^'.-[^\\]']]`
			`local STRING2 = [[^".-[^\\]"]]`
			`local STRING3 = "^((['\"])%2)" -- empty string`
initial commit 2011-04-12 19:07:47 +02:00			`local PREPRO = '^#.-[^\\]\n'`

			`local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword`

			`local function tdump(tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return tok,tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local function ndump(tok,options)`
			`if options and options.number then`
			`tok = tonumber(tok)`
			`end`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "number",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`-- regular strings, single or double quotes; usually we want them`
			`-- without the quotes`
			`local function sdump(tok,options)`
			`if options and options.string then`
			`tok = tok:sub(2,-2)`
			`end`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "string",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`-- long Lua strings need extra work to get rid of the quotes`
			`local function sdump_l(tok,options)`
			`if options and options.string then`
			`tok = tok:sub(3,-3)`
			`end`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "string",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local function chdump(tok,options)`
			`if options and options.string then`
			`tok = tok:sub(2,-2)`
			`end`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "char",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local function cdump(tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return 'comment',tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local function wsdump (tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "space",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local function pdump (tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return 'prepro',tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local function plain_vdump(tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "iden",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local function lua_vdump(tok)`
			`if lua_keyword[tok] then`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "keyword",tok`
initial commit 2011-04-12 19:07:47 +02:00			`else`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "iden",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`
			`end`

			`local function cpp_vdump(tok)`
			`if cpp_keyword[tok] then`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "keyword",tok`
initial commit 2011-04-12 19:07:47 +02:00			`else`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return "iden",tok`
initial commit 2011-04-12 19:07:47 +02:00			`end`
			`end`

track line numbers in full text mode by counting linefeeds in spaces and comments 2011-08-21 17:23:58 +02:00			`local function count_lines(line, text)`
			`local index, limit = 1, #text`
			`while index <= limit do`
			`local start, stop = text:find('\r\n', index, true)`
			`if not start then`
			`start, stop = text:find('[\r\n\f]', index)`
			`if not start then break end`
			`end`
			`index = stop + 1`
			`line = line + 1`
			`end`
			`return line`
			`end`

			`local multiline = { comment = true, space = true }`


initial commit 2011-04-12 19:07:47 +02:00			`--- create a plain token iterator from a string or file-like object.`
			`-- @param s the string`
			`-- @param matches an optional match table (set of pattern-action pairs)`
			`-- @param filter a table of token types to exclude, by default {space=true}`
			`-- @param options a table of options; by default, {number=true,string=true},`
			`-- which means convert numbers and strip string quotes.`
			`function lexer.scan (s,matches,filter,options)`
			`--assert_arg(1,s,'string')`
			`local file = type(s) ~= 'string' and s`
			`filter = filter or {space=true}`
			`options = options or {number=true,string=true}`
			`if filter then`
			`if filter.space then filter[wsdump] = true end`
			`if filter.comments then`
			`filter[cdump] = true`
			`end`
			`end`
			`if not matches then`
			`if not plain_matches then`
			`plain_matches = {`
			`{WSPACE,wsdump},`
			`{NUMBER3,ndump},`
			`{IDEN,plain_vdump},`
			`{NUMBER1,ndump},`
			`{NUMBER2,ndump},`
			`{STRING3,sdump},`
			`{STRING1,sdump},`
			`{STRING2,sdump},`
			`{'^.',tdump}`
			`}`
			`end`
			`matches = plain_matches`
			`end`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`local i1,i2,idx,res1,res2,tok,pat,fun,capt`
			`local line = 1`
			`if file then`
			`s = file:read()`
			`if not s then return nil end -- empty file`
			`s = s ..'\n'`
			`end`
			`local sz = #s`
			`local idx = 1`
			`if sz == 0 then return nil end -- empty file`

			`local res = {}`
			`local mt = {}`
			`mt.__index = mt`
			`setmetatable(res,mt)`

			`function mt.lineno() return line end`

			`function mt.getline()`
			`if idx < sz then`
			`tok = strsub(s,idx,-2)`
			`idx = sz + 1`
			`line = line + 1`
			`return tok`
			`else`
			`idx = sz + 1`
			`line = line + 1`
			`return file:read()`
initial commit 2011-04-12 19:07:47 +02:00			`end`
			`end`

lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`function mt.next (tok)`
			`local t,v = tok()`
			`while t == 'space' do`
			`t,v = tok()`
			`end`
			`return t,v`
			`end`
initial commit 2011-04-12 19:07:47 +02:00
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`function mt.__call ()`
			`while true do`
			`for _,m in ipairs(matches) do`
			`pat,fun = m[1],m[2]`
			`if fun == nil then error("no match for "..pat) end`
			`i1,i2 = strfind(s,pat,idx)`
			`if i1 then`
			`tok = strsub(s,i1,i2)`
			`idx = i2 + 1`
			`if not (filter and filter[fun]) then`
			`lexer.finished = idx > sz`
track line numbers in full text mode by counting linefeeds in spaces and comments 2011-08-21 17:23:58 +02:00			`local t,v = fun(tok,options)`
			`if not file and multiline[t] then`
			`line = count_lines(line,v)`
			`end`
			`return t,v`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`end`
			`end`
initial commit 2011-04-12 19:07:47 +02:00			`end`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`if idx > sz then`
			`if file then`
			`line = line + 1`
			`s = file:read()`
			`if not s then return end`
			`s = s .. '\n'`
			`idx ,sz = 1,#s`
			`else`
			`return`
			`end`
			`end`
			`end`
initial commit 2011-04-12 19:07:47 +02:00			`end`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return res`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`--- get everything in a stream upto a newline.`
			`-- @param tok a token stream`
			`-- @return a string`
			`function lexer.getline (tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return tok:getline()`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`--- get current line number. <br>`
			`-- Only available if the input source is a file-like object.`
			`-- @param tok a token stream`
			`-- @return the line number and current column`
			`function lexer.lineno (tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return tok:lineno()`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`--- get the Lua keywords as a set-like table.`
			`-- So <code>res["and"]</code> etc would be <code>true</code>.`
			`-- @return a table`
			`function lexer.get_keywords ()`
			`if not lua_keyword then`
			`lua_keyword = {`
			`["and"] = true, ["break"] = true, ["do"] = true,`
			`["else"] = true, ["elseif"] = true, ["end"] = true,`
			`["false"] = true, ["for"] = true, ["function"] = true,`
			`["if"] = true, ["in"] = true, ["local"] = true, ["nil"] = true,`
			`["not"] = true, ["or"] = true, ["repeat"] = true,`
			`["return"] = true, ["then"] = true, ["true"] = true,`
			`["until"] = true, ["while"] = true`
			`}`
			`end`
			`return lua_keyword`
			`end`


			`--- create a Lua token iterator from a string or file-like object.`
			`-- Will return the token type and value.`
			`-- @param s the string`
			`-- @param filter a table of token types to exclude, by default {space=true,comments=true}`
			`-- @param options a table of options; by default, {number=true,string=true},`
			`-- which means convert numbers and strip string quotes.`
			`function lexer.lua(s,filter,options)`
			`filter = filter or {space=true,comments=true}`
			`lexer.get_keywords()`
			`if not lua_matches then`
			`lua_matches = {`
			`{WSPACE,wsdump},`
			`{NUMBER3,ndump},`
			`{IDEN,lua_vdump},`
			`{NUMBER4,ndump},`
			`{NUMBER5,ndump},`
			`{STRING3,sdump},`
			`{STRING1,sdump},`
			`{STRING2,sdump},`
handle [=[...]=] comments and strings 2011-07-14 15:23:20 +02:00			`{'^%-%-%[(=*)%[.-%]%1%]',cdump},`
initial commit 2011-04-12 19:07:47 +02:00			`{'^%-%-.-\n',cdump},`
handle [=[...]=] comments and strings 2011-07-14 15:23:20 +02:00			`{'^%[(=*)%[.-%]%1%]',sdump_l},`
initial commit 2011-04-12 19:07:47 +02:00			`{'^==',tdump},`
			`{'^~=',tdump},`
			`{'^<=',tdump},`
			`{'^>=',tdump},`
			`{'^%.%.%.',tdump},`
handling of strings was borked 2011-07-08 16:00:21 +02:00			`{'^%.%.',tdump},`
initial commit 2011-04-12 19:07:47 +02:00			`{'^.',tdump}`
			`}`
			`end`
			`return lexer.scan(s,lua_matches,filter,options)`
			`end`

			`--- create a C/C++ token iterator from a string or file-like object.`
			`-- Will return the token type type and value.`
			`-- @param s the string`
			`-- @param filter a table of token types to exclude, by default {space=true,comments=true}`
			`-- @param options a table of options; by default, {number=true,string=true},`
			`-- which means convert numbers and strip string quotes.`
			`function lexer.cpp(s,filter,options)`
			`filter = filter or {comments=true}`
			`if not cpp_keyword then`
			`cpp_keyword = {`
			`["class"] = true, ["break"] = true, ["do"] = true, ["sizeof"] = true,`
			`["else"] = true, ["continue"] = true, ["struct"] = true,`
			`["false"] = true, ["for"] = true, ["public"] = true, ["void"] = true,`
			`["private"] = true, ["protected"] = true, ["goto"] = true,`
			`["if"] = true, ["static"] = true, ["const"] = true, ["typedef"] = true,`
			`["enum"] = true, ["char"] = true, ["int"] = true, ["bool"] = true,`
			`["long"] = true, ["float"] = true, ["true"] = true, ["delete"] = true,`
			`["double"] = true, ["while"] = true, ["new"] = true,`
			`["namespace"] = true, ["try"] = true, ["catch"] = true,`
			`["switch"] = true, ["case"] = true, ["extern"] = true,`
			`["return"] = true,["default"] = true,['unsigned'] = true,['signed'] = true,`
			`["union"] = true, ["volatile"] = true, ["register"] = true,["short"] = true,`
			`}`
			`end`
			`if not cpp_matches then`
			`cpp_matches = {`
			`{WSPACE,wsdump},`
			`{PREPRO,pdump},`
			`{NUMBER3,ndump},`
			`{IDEN,cpp_vdump},`
			`{NUMBER4,ndump},`
			`{NUMBER5,ndump},`
			`{STRING3,sdump},`
			`{STRING1,chdump},`
			`{STRING2,sdump},`
			`{'^//.-\n',cdump},`
			`{'^/%.-%/',cdump},`
			`{'^==',tdump},`
			`{'^!=',tdump},`
			`{'^<=',tdump},`
			`{'^>=',tdump},`
			`{'^->',tdump},`
			`{'^&&',tdump},`
			`{'^\|\|',tdump},`
			`{'^%+%+',tdump},`
			`{'^%-%-',tdump},`
			`{'^%+=',tdump},`
			`{'^%-=',tdump},`
			`{'^%*=',tdump},`
			`{'^/=',tdump},`
			`{'^\|=',tdump},`
			`{'^%^=',tdump},`
			`{'^::',tdump},`
			`{'^.',tdump}`
			`}`
			`end`
			`return lexer.scan(s,cpp_matches,filter,options)`
			`end`

			`--- get a list of parameters separated by a delimiter from a stream.`
			`-- @param tok the token stream`
			`-- @param endtoken end of list (default ')'). Can be '\n'`
			`-- @param delim separator (default ',')`
			`-- @return a list of token lists.`
			`function lexer.get_separated_list(tok,endtoken,delim)`
			`endtoken = endtoken or ')'`
			`delim = delim or ','`
			`local parm_values = {}`
			`local level = 1 -- used to count ( and )`
			`local tl = {}`
			`local function tappend (tl,t,val)`
			`val = val or t`
			`append(tl,{t,val})`
			`end`
			`local is_end`
			`if endtoken == '\n' then`
			`is_end = function(t,val)`
			`return t == 'space' and val:find '\n'`
			`end`
			`else`
			`is_end = function (t)`
			`return t == endtoken`
			`end`
			`end`
table fields and their descriptions can be extracted from Lua table definitions 2011-04-17 19:01:57 +02:00			`local is_delim`
			`if type(delim) == 'function' then`
			`is_delim = delim`
			`else`
			`is_delim = function(t)`
			`return t == delim`
			`end`
			`end`
initial commit 2011-04-12 19:07:47 +02:00			`local token,value`
			`while true do`
			`token,value=tok()`
			`if not token then return nil,'EOS' end -- end of stream is an error!`
			`if is_end(token,value) and level == 1 then`
			`append(parm_values,tl)`
			`break`
			`elseif token == '(' then`
			`level = level + 1`
			`tappend(tl,'(')`
			`elseif token == ')' then`
			`level = level - 1`
			`if level == 0 then -- finished with parm list`
			`append(parm_values,tl)`
			`break`
			`else`
			`tappend(tl,')')`
			`end`
table fields and their descriptions can be extracted from Lua table definitions 2011-04-17 19:01:57 +02:00			`elseif level == 1 and is_delim(token) then`
initial commit 2011-04-12 19:07:47 +02:00			`append(parm_values,tl) -- a new parm`
			`tl = {}`
			`else`
			`tappend(tl,token,value)`
			`end`
			`end`
			`return parm_values,{token,value}`
			`end`

			`--- get the next non-space token from the stream.`
			`-- @param tok the token stream.`
			`function lexer.skipws (tok)`
lexer rewritten to return an object and not a coroutine. Methods like getline() and lineno() now work much better; sorts out problems encountered with Lua block comments 2011-08-14 11:54:40 +02:00			`return tok:next()`
initial commit 2011-04-12 19:07:47 +02:00			`end`

			`local skipws = lexer.skipws`

			`--- get the next token, which must be of the expected type.`
			`-- Throws an error if this type does not match!`
			`-- @param tok the token stream`
			`-- @param expected_type the token type`
			`-- @param no_skip_ws whether we should skip whitespace`
			`function lexer.expecting (tok,expected_type,no_skip_ws)`
			`assert_arg(1,tok,'function')`
			`assert_arg(2,expected_type,'string')`
			`local t,v`
			`if no_skip_ws then`
			`t,v = tok()`
			`else`
			`t,v = skipws(tok)`
			`end`
			`if t ~= expected_type then error ("expecting "..expected_type,2) end`
			`return v`
			`end`

			`return lexer`