-- Module:Documentation/Lexer
-- MIT License
--
-- Copyright (c) 2018 LoganDark
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy
-- of this software and associated documentation files (the "Software"), to deal
-- in the Software without restriction, including without limitation the rights
-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-- copies of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be included in all
-- copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-- SOFTWARE.
--- Turns a string or a sequence into a set-like lookup table.
--
-- For a string, every single character becomes a key; for a table, every
-- element at indices 1..#src becomes a key. Each key is mapped to `true`.
-- Any other kind of `src` adds nothing.
--
-- Note: this function is declared without `local`, so it is a global.
--
-- @param src  string or table whose characters/elements become keys
-- @param list optional table to add the keys to (a new table when omitted)
-- @return the table that received the keys (`list` itself when it was given)
function lookupify(src, list)
	local set = list or {}
	local kind = type(src)

	if kind == 'string' then
		local length = #src
		for index = 1, length do
			set[string.sub(src, index, index)] = true
		end
	elseif kind == 'table' then
		local count = #src
		for index = 1, count do
			set[src[index]] = true
		end
	end

	return set
end
-- Raw character sets that the lookup tables below are built from.
local base_ident = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
local base_digits = '0123456789'
local base_operators = '+-*/^%#'

-- Character classes used by the lexer. Every class is a lookup table that
-- maps a single character to `true`; some classes also carry a named
-- sub-class stored under a multi-character key.
local chars = {}

chars.whitespace = lookupify(' \n\t\r')

-- Characters accepted directly after a backslash inside a quoted string.
chars.validEscapes = lookupify('abfnrtv"\'\\')

-- Characters allowed anywhere in an identifier; `start` is the subset
-- allowed as its first character (no digits).
chars.ident = lookupify(base_ident .. base_digits)
chars.ident.start = lookupify(base_ident)

-- Decimal digits; `hex` additionally holds the hexadecimal letters.
chars.digits = lookupify(base_digits)
chars.digits.hex = lookupify(base_digits .. 'abcdefABCDEF')

-- Punctuation; `equality` holds the characters that may be followed by '=',
-- `operators` holds the single-character operators.
chars.symbols = lookupify(base_operators .. ',{}[]();.:')
chars.symbols.equality = lookupify('~=><')
chars.symbols.operators = lookupify(base_operators)
-- Reserved words, split by how the lexer labels them:
--   structure -> pushed as 'keyword' tokens
--   values    -> pushed as 'value' tokens
local structure_words = {
	'and', 'break', 'do', 'else', 'elseif', 'end', 'for', 'function',
	'goto', 'if', 'in', 'local', 'not', 'or', 'repeat', 'return', 'then',
	'until', 'while'
}
local value_words = { 'true', 'false', 'nil' }

local keywords = {}
keywords.structure = lookupify(structure_words)
keywords.values = lookupify(value_words)
--- Splits Lua source text into per-line lists of tokens.
--
-- Consecutive pieces of text that receive the same token type are merged into
-- one token, and tokens with empty text are never stored.
--
-- Token types produced: 'whitespace', 'comment', 'string_start', 'string',
-- 'escape', 'string_end', 'keyword', 'value', 'ident', 'number', 'symbol',
-- 'operator', 'vararg', 'label_start', 'label', 'label_end', 'unidentified'.
--
-- @param text string holding the Lua source to tokenize
-- @return a sequence with one entry per source line; each entry is a sequence
--   of tokens `{ type = ..., data = ..., posFirst = ..., posLast = ... }`
--   where posFirst/posLast are computed relative to the start of the line
return function(text)
	local pos = 1     -- index of the next character to read
	local start = 1   -- index where the token currently being built begins
	local buffer = {} -- tokens collected for the current line
	local lines = {}  -- finished lines (each one a former `buffer`)

	-- Returns the character at `pos + delta` without consuming it
	-- ('' when that index is outside the text).
	local function look(delta)
		local index = pos + (delta or 0)
		return text:sub(index, index)
	end

	-- Consumes and returns the next character ('' at end of input).
	local function get()
		pos = pos + 1
		return look(-1)
	end

	-- Called right after an opening '['. Counts the '=' characters that follow;
	-- if they are followed by a second '[', consumes through it and returns the
	-- count (the long-bracket level). Otherwise returns nil and leaves `pos`
	-- untouched.
	local function getDataLevel()
		local num = 0
		while look(num) == '=' do
			num = num + 1
		end
		if look(num) == '[' then
			pos = pos + num + 1
			return num
		end
	end

	-- Text consumed since the last pushed token.
	local function getCurrentTokenText()
		return text:sub(start, pos - 1)
	end

	local currentLineLength = 0 -- length of the text pushed on the current line
	local lineoffset = 0        -- total length of the text on all previous lines

	-- Emits the pending text (or `tokenText` when given) as a token of type
	-- `tokenType`. If the last token on the line has the same type the text is
	-- appended to it instead; a new token with empty text is not stored.
	-- Returns the token that was created or extended.
	local function pushToken(tokenType, tokenText)
		tokenText = tokenText or getCurrentTokenText()
		local tk = buffer[#buffer]
		if not tk or tk.type ~= tokenType then
			tk = {
				type = tokenType,
				data = tokenText,
				posFirst = start - lineoffset,
				posLast = pos - 1 - lineoffset
			}
			if tk.data ~= '' then
				buffer[#buffer + 1] = tk
			end
		else
			tk.data = tk.data .. tokenText
			tk.posLast = tk.posLast + tokenText:len()
		end
		currentLineLength = currentLineLength + tokenText:len()
		start = pos
		return tk
	end

	-- Finishes the current line and consumes the newline character at `pos`.
	local function newline()
		lines[#lines + 1] = buffer
		buffer = {}
		get()
		-- The newline is pushed only to advance `start` and the line length;
		-- the token itself is discarded right away.
		pushToken('newline')
		buffer[1] = nil
		lineoffset = lineoffset + currentLineLength
		currentLineLength = 0
	end

	-- Reads the body of a long bracket of the given level, pushing the text of
	-- each line as a `tokenType` token whenever a newline is crossed. On return
	-- `pos` is on the first ']' of the closing bracket, or past the end of the
	-- input when the bracket is never closed.
	local function getData(level, tokenType)
		while true do
			local char = get()
			if char == '' then
				return
			elseif char == '\n' then
				pos = pos - 1
				pushToken(tokenType)
				newline()
			elseif char == ']' then
				local valid = true
				for _ = 1, level do
					if look() == '=' then
						pos = pos + 1
					else
						valid = false
						break
					end
				end
				if valid and look() == ']' then
					-- Step back onto the first ']' so the caller decides how
					-- the closing bracket is tokenized.
					pos = pos - level - 1
					return
				end
			end
		end
	end

	-- Reads the rest of a single-line comment, pushes it as a 'comment' token
	-- and consumes the terminating newline, if there is one.
	local function getLineComment()
		while true do
			local char = get()
			if char == '' or char == '\n' then
				pos = pos - 1
				pushToken('comment')
				if char == '\n' then
					newline()
				end
				break
			end
		end
	end

	-- Consumes whitespace (including newlines) and pushes it as 'whitespace'.
	local function chompWhitespace()
		while true do
			local char = look()
			if char == '\n' then
				pushToken('whitespace')
				newline()
			elseif chars.whitespace[char] then
				pos = pos + 1
			else
				break
			end
		end
		pushToken('whitespace')
	end

	while true do
		chompWhitespace()
		local char = get()
		if char == '' then
			break
		elseif char == '-' and look() == '-' then
			-- Comment: '--[[ ... ]]' / '--[=[ ... ]=]' or single-line.
			pos = pos + 1
			local level
			if look() == '[' then
				pos = pos + 1
				level = getDataLevel()
			end
			if level then
				getData(level, 'comment')
				pos = pos + level + 2 -- skip the closing bracket
				pushToken('comment')
			else
				getLineComment()
			end
			pushToken('comment')
		elseif char == '\'' or char == '"' then
			-- Quoted string; `char` is the quote that must close it.
			pushToken('string_start')
			while true do
				local char2 = get()
				if char2 == '\\' then
					-- Flush the text before the backslash, then read the escape.
					pos = pos - 1
					pushToken('string')
					get()
					local char3 = get()
					if chars.digits[char3] then
						-- Decimal escape: up to three digits in total.
						for _ = 1, 2 do
							if chars.digits[look()] then
								pos = pos + 1
							end
						end
					elseif char3 == 'x' then
						-- Hex escape: exactly two hex digits are required.
						if chars.digits.hex[look()] and chars.digits.hex[look(1)] then
							pos = pos + 2
						else
							pushToken('unidentified')
						end
					elseif char3 == '\n' then
						-- Backslash-newline: the string continues on the next line.
						pos = pos - 1
						pushToken('escape')
						newline()
					elseif not chars.validEscapes[char3] then
						pushToken('unidentified')
					end
					pushToken('escape')
				elseif char2 == '\n' then
					-- Unescaped newline ends the string without a closing quote.
					pos = pos - 1
					pushToken('string')
					newline()
					break
				elseif char2 == char or char2 == '' then
					pos = pos - 1
					pushToken('string')
					get()
					break
				end
			end
			pushToken('string_end')
		elseif chars.ident.start[char] then
			while chars.ident[look()] do
				pos = pos + 1
			end
			local word = getCurrentTokenText()
			if keywords.structure[word] then
				pushToken('keyword')
			elseif keywords.values[word] then
				pushToken('value')
			else
				pushToken('ident')
			end
		elseif chars.digits[char] or (char == '.' and chars.digits[look()]) then
			-- Lua accepts both '0x' and '0X' as the hexadecimal prefix.
			if char == '0' and look():lower() == 'x' then
				pos = pos + 1
				while chars.digits.hex[look()] do
					pos = pos + 1
				end
			else
				while chars.digits[look()] do
					pos = pos + 1
				end
				if look() == '.' then
					pos = pos + 1
					while chars.digits[look()] do
						pos = pos + 1
					end
				end
				if look():lower() == 'e' then
					pos = pos + 1
					-- The exponent may carry an explicit sign: '1e-5' or '1e+5'.
					local sign = look()
					if sign == '-' or sign == '+' then
						pos = pos + 1
					end
					while chars.digits[look()] do
						pos = pos + 1
					end
				end
			end
			pushToken('number')
		elseif char == '[' then
			-- Either a long string '[[ ... ]]' / '[=[ ... ]=]' or a plain '['.
			local level = getDataLevel()
			if level then
				pushToken('string_start')
				getData(level, 'string')
				pushToken('string')
				pos = pos + level + 2 -- skip the closing bracket
				pushToken('string_end')
			else
				pushToken('symbol')
			end
		elseif char == '.' then
			-- '.', '..' or '...'; only the three-dot form is a vararg.
			if look() == '.' then
				pos = pos + 1
				if look() == '.' then
					pos = pos + 1
				end
			end
			if getCurrentTokenText():len() == 3 then
				pushToken('vararg')
			else
				pushToken('symbol')
			end
		elseif char == ':' and look() == ':' then
			-- Label: '::' name '::', with optional whitespace around the name.
			get()
			pushToken('label_start')
			chompWhitespace()
			if chars.ident.start[look()] then
				get()
				while chars.ident[look()] do
					get()
				end
				pushToken('label')
				chompWhitespace()
				if look() == ':' and look(1) == ':' then
					get()
					get()
					pushToken('label_end')
				end
			end
		elseif chars.symbols.equality[char] then
			-- '~', '=', '>', '<', each optionally followed by '='.
			if look() == '=' then
				pos = pos + 1
			end
			pushToken('operator')
		elseif chars.symbols[char] then
			if chars.symbols.operators[char] then
				pushToken('operator')
			else
				pushToken('symbol')
			end
		else
			pushToken('unidentified')
		end
	end

	-- Store the tokens of the last (unterminated) line.
	lines[#lines + 1] = buffer
	return lines
end