1 --[[--------------------------------------------------------------------
3 llex.lua: Lua 5.1 lexical analyzer in Lua
4 This file is part of LuaSrcDiet, based on Yueliang material.
6 Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
7 The COPYRIGHT file describes the conditions
8 under which this software may be distributed.
10 See the ChangeLog for more information.
12 ----------------------------------------------------------------------]]
14 --[[--------------------------------------------------------------------
16 -- * This is a version of the native 5.1.x lexer from Yueliang 0.4.0,
17 -- with significant modifications to handle LuaSrcDiet's needs:
18 -- (1) llex.error is an optional error function handler
19 -- (2) seminfo for strings include their delimiters and no
20 -- translation operations are performed on them
21 -- * ADDED: shbang handling to support executable scripts
22 -- * NO localized decimal point replacement magic
23 -- * NO limit to number of lines
24 -- * NO support for compatible long strings (LUA_COMPAT_LSTR)
25 -- * Please read technotes.txt for more technical details.
26 ----------------------------------------------------------------------]]
29 local string = require "string"
32 local find = string.find
33 local match = string.match
34 local sub = string.sub
36 ----------------------------------------------------------------------
37 -- initialize keyword list, variables
38 ----------------------------------------------------------------------
-- Build the reserved-word lookup used by the main lexer to distinguish
-- TK_KEYWORD from TK_NAME. Iterates every whitespace-delimited word in the
-- long-string literal below.
41 for v in string.gmatch([[
42 and break do else elseif end false for function if in
43 local nil not or repeat return then true until while]], "%S+") do
-- NOTE(review): the loop body (presumably marking each keyword in a set
-- table) and the closing 'end' are elided from this dump -- the embedded
-- original line numbers jump from 43 to 47. Confirm against the full file.
47 -- NOTE: see init() for module variables (externally visible):
48 -- tok, seminfo, tokln
-- Module-level lexer state shared by all functions below.
50 local z, -- source stream
51 sourceid, -- name of source
52 I, -- position of lexer
53 buff, -- buffer for strings
-- NOTE(review): this declaration list is cut mid-statement (trailing comma);
-- at least one further variable is elided -- likely the current-line counter
-- 'ln' referenced by errorline() below. TODO: confirm against the full file.
56 ----------------------------------------------------------------------
57 -- add information to token listing
58 ----------------------------------------------------------------------
-- Append one token to the module's parallel output lists.
-- token: token-type string (e.g. "TK_NAME"); info: its semantic payload
-- (lexeme text, string contents, etc.).
-- NOTE(review): the body and closing 'end' are elided from this dump
-- (original line numbers jump 60 -> 67). From the call sites it evidently
-- records into tok/seminfo (and presumably tokln) -- confirm in full file.
60 local function addtoken(token, info)
67 ----------------------------------------------------------------------
68 -- handles line number incrementation and end-of-line characters
69 ----------------------------------------------------------------------
-- Consume one end-of-line sequence at position i, handling all four forms:
-- "\n", "\r", "\n\r", "\r\n". When is_tok is true the EOL is also emitted
-- as a TK_EOL token carrying the exact newline characters seen.
-- Callers (e.g. read_long_string) use its return value as the new position,
-- so the elided tail presumably ends with 'return i' -- TODO confirm.
71 local function inclinenumber(i, is_tok)
73 local old = sub(z, i, i)
74 i = i + 1 -- skip '\n' or '\r'
75 local c = sub(z, i, i)
76 if (c == "\n" or c == "\r") and (c ~= old) then
77 i = i + 1 -- skip '\n\r' or '\r\n'
-- NOTE(review): lines 78-79 are elided here (likely 'old = old..c' and the
-- closing 'end' of the two-char-newline branch) -- confirm in full file.
80 if is_tok then addtoken("TK_EOL", old) end
-- NOTE(review): the remainder (line-counter increment, 'I = i', 'return i',
-- closing 'end') is elided from this dump.
86 ----------------------------------------------------------------------
87 -- initialize lexer for given source _z and source name _sourceid
88 ----------------------------------------------------------------------
-- Initialize the lexer for a new compilation unit.
-- _z: the full source text as a string; _sourceid: its name (for messages).
-- Resets the externally visible output lists (tok/seminfo/tokln) and handles
-- a leading "#..." shbang line by emitting it as TK_COMMENT (+ TK_EOL).
-- NOTE(review): elided lines (91, 94, 104, 107+) presumably assign z = _z,
-- reset the line counter, advance I past the shbang text, and close the
-- 'if'/function -- confirm against the full file.
90 function init(_z, _sourceid)
92 sourceid = _sourceid -- name of source
93 I = 1 -- lexer's position in source
95 tok = {} -- lexed token list*
96 seminfo = {} -- lexed semantic information list*
97 tokln = {} -- line numbers for messages*
98 -- (*) externally visible thru' module
99 --------------------------------------------------------------------
100 -- initial processing (shbang handling)
101 --------------------------------------------------------------------
-- Captures: q = the "#..." comment text, r = its optional line terminator.
102 local p, _, q, r = find(z, "^(#[^\r\n]*)(\r?\n?)")
103 if p then -- skip first line
105 addtoken("TK_COMMENT", q)
106 if #r > 0 then inclinenumber(I, true) end
110 ----------------------------------------------------------------------
111 -- returns a chunk name or id, no truncation for long names
112 ----------------------------------------------------------------------
-- Body fragment of chunkid() (see the header comment above): produces the
-- chunk name used in error messages. For "=name"/"@name" source ids the
-- leading marker character is stripped, mirroring Lua's own convention.
-- NOTE(review): the 'function chunkid()' line, the else-branch (fallback
-- name for plain string sources), and the closing 'end' are elided from
-- this dump -- confirm against the full file.
115 if sourceid and match(sourceid, "^[=@]") then
116 return sub(sourceid, 2) -- remove first char
121 ----------------------------------------------------------------------
122 -- formats error message and throws error
123 -- * a simplified version, does not report what token was responsible
124 ----------------------------------------------------------------------
-- Report a lexer error and abort via an error handler.
-- s: message text; line: optional explicit line number (defaults to the
-- module's current line counter 'ln', declared in the elided part of the
-- locals list above -- see NOTE there).
-- Per the file header, 'error' may be an optional externally-installed
-- handler; base.error is the fallback.
126 function errorline(s, line)
127 local e = error or base.error
128 e(string.format("%s:%d: %s", chunkid(), line or ln, s))
-- NOTE(review): the closing 'end' (original line 129) is elided here.
-- The local alias gives the functions below an upvalue-speed reference.
130 local errorline = errorline
132 ------------------------------------------------------------------------
133 -- count separators ("=") in a long string delimiter
134 ------------------------------------------------------------------------
-- Count the "=" separators of a long-bracket delimiter starting at i
-- (i points at the first '[' or ']').
-- Returns >= 0 (the '=' count) when the bracket character repeats after the
-- separators, i.e. a well-formed "[==[" / "]==]"; otherwise returns
-- (-count)-1, so -1 encodes "single bracket, not a long-bracket delimiter".
-- NOTE(review): elided lines (137, 139, 141-142) presumably advance i past
-- the separators and update the module position I -- confirm in full file.
136 local function skip_sep(i)
138 local s = sub(z, i, i)
140 local count = #match(z, "=*", i) -- note, take the length
143 return (sub(z, i, i) == s) and count or (-count) - 1
146 ----------------------------------------------------------------------
147 -- reads a long string or long comment
148 ----------------------------------------------------------------------
-- Read a long string ("[=*[ ... ]=*]") or long comment with 'sep' equal
-- signs in its delimiter. is_str selects the error wording only; per the
-- file header, seminfo keeps the delimiters and performs no translation.
-- Relies on the module position I and the 'buff' start marker (set by the
-- elided caller code) -- the returned text spans sub(z, buff, I).
150 local function read_long_string(is_str, sep)
151 local i = I + 1 -- skip 2nd '['
153 local c = sub(z, i, i)
154 if c == "\r" or c == "\n" then -- string starts with a newline?
155 i = inclinenumber(i) -- skip it
-- NOTE(review): the enclosing 'while true do' scan loop (original lines
-- 156-158) is elided here -- the find below is evidently its body.
-- Scan forward to the next newline or ']' (candidate delimiter).
159 local p, q, r = find(z, "([\r\n%]])", i) -- (long range)
-- NOTE(review): the 'if not p then' guard around this error call is elided.
161 errorline(is_str and "unfinished long string" or
162 "unfinished long comment")
165 if r == "]" then -- delimiter test
166 if skip_sep(i) == sep then
-- Matching closer found: capture everything from the saved start marker.
167 buff = sub(z, buff, I)
168 I = I + 1 -- skip 2nd ']'
-- NOTE(review): the 'return buff', newline-advance else-branch, and closing
-- 'end's (original lines 169-178) are elided from this dump.
179 ----------------------------------------------------------------------
181 ----------------------------------------------------------------------
-- Read a single- or double-quoted string whose opening delimiter is 'del'.
-- Escape sequences are validated but NOT translated (the raw text including
-- both delimiters is returned -- see the file header notes).
183 local function read_string(del)
-- NOTE(review): initialization (local i, buff start marker) and the scan
-- loop header (original lines 184-187) are elided from this dump.
-- Jump to the next "interesting" character: newline, backslash, or quote.
188 local p, q, r = find(z, "([\n\r\\\"\'])", i) -- (long range)
190 if r == "\n" or r == "\r" then
-- Bare newline inside a short string is illegal in Lua.
191 errorline("unfinished string")
194 if r == "\\" then -- handle escapes
-- NOTE(review): the advance past '\\' and re-read of r (lines 195-196)
-- are elided here.
197 if r == "" then break end -- (EOZ error)
-- Plain find: is r one of the single-char escapes a b f n r t v or a
-- line continuation (\<newline>)?
198 p = find("abfnrtv\n\r", r, 1, true)
199 ------------------------------------------------------
200 if p then -- special escapes
-- NOTE(review): branch body elided (presumably inclinenumber for the
-- \<newline> cases, otherwise a simple advance).
206 ------------------------------------------------------
207 elseif find(r, "%D") then -- other non-digits
-- \<punct> escapes like \\ \" \' -- accepted verbatim; body elided.
209 ------------------------------------------------------
210 else -- \xxx sequence
-- Up to three decimal digits; s captures the digit run.
211 local p, q, s = find(z, "^(%d%d?%d?)", i)
213 if s + 1 > 256 then -- UCHAR_MAX
-- NOTE(review): 's + 1 > 256' matches upstream Yueliang/LuaSrcDiet;
-- it relies on string->number coercion of the captured digits.
214 errorline("escape sequence too large")
216 ------------------------------------------------------
220 if r == del then -- ending delimiter
-- NOTE(review): 'i' here is presumably one past the closing quote
-- (advance elided), hence the i - 1 below includes the delimiter.
222 return sub(z, buff, i - 1) -- return string
-- Fell out of the scan loop without a closing delimiter: end of source.
229 errorline("unfinished string")
-- NOTE(review): the closing 'end' of the function is elided from this dump.
232 ------------------------------------------------------------------------
233 -- main lexer function
234 ------------------------------------------------------------------------
-- Main lexer driver (interior fragment). NOTE(review): the 'function llex()'
-- header, the outer 'while true do' over the source, the per-iteration
-- 'local i = I' / buff setup, and most branch/loop closers are elided from
-- this dump (original line numbers are sparse throughout) -- the comments
-- below describe only what the visible lines establish.
241 -- inner loop allows break to be used to nicely section tests
243 ----------------------------------------------------------------
-- Identifiers and keywords: longest [_%a][_%w]* run at position i.
244 local p, _, r = find(z, "^([_%a][_%w]*)", i)
248 addtoken("TK_KEYWORD", r) -- reserved word (keyword)
250 addtoken("TK_NAME", r) -- identifier
254 ----------------------------------------------------------------
-- Numbers: optional leading '.', digit runs, optional [eE] exponent with
-- optional sign, then any trailing identifier chars; the whole lexeme is
-- validated by tonumber (which also accepts hex like 0x1A).
255 local p, _, r = find(z, "^(%.?)%d", i)
257 if r == "." then i = i + 1 end
258 local _, q, r = find(z, "^%d*[%.%d]*([eE]?)", i)
260 if #r == 1 then -- optional exponent
261 if match(z, "^[%+%-]", i) then -- optional sign
265 local _, q = find(z, "^[_%w]*", i)
267 local v = sub(z, p, q) -- string equivalent
268 if not base.tonumber(v) then -- handles hex test also
269 errorline("malformed number")
271 addtoken("TK_NUMBER", v)
274 ----------------------------------------------------------------
-- Whitespace: t is the first %s char; a newline routes to inclinenumber
-- (emitting TK_EOL), anything else becomes a TK_SPACE run.
275 local p, q, r, t = find(z, "^((%s)[ \t\v\f]*)", i)
277 if t == "\n" or t == "\r" then -- newline
278 inclinenumber(i, true)
280 I = q + 1 -- whitespace
281 addtoken("TK_SPACE", r)
285 ----------------------------------------------------------------
-- Punctuation: p indexes r within "-[\"'.=<>~" to pick a sub-handler.
286 local r = match(z, "^%p", i)
289 local p = find("-[\"\'.=<>~", r, 1, true)
291 -- two-level if block for punctuation/symbols
292 --------------------------------------------------------
294 if p == 1 then -- minus
-- "--[" may open a long comment; c captures the optional '['.
295 local c = match(z, "^%-%-(%[?)", i)
-- NOTE(review): the skip_sep call assigning 'sep' (lines 296-301)
-- is elided here.
302 if sep >= 0 then -- long comment
303 addtoken("TK_LCOMMENT", read_long_string(false, sep))
304 else -- short comment
-- Short comment runs to end of line (or end of source).
305 I = find(z, "[\n\r]", i) or (#z + 1)
306 addtoken("TK_COMMENT", sub(z, buff, I - 1))
310 -- (fall through for "-")
311 else -- [ or long string
312 local sep = skip_sep(i)
314 addtoken("TK_LSTRING", read_long_string(true, sep))
315 elseif sep == -1 then
-- Lone '[' with no long-bracket: plain operator token.
316 addtoken("TK_OP", "[")
-- sep < -1 means "[=" with no second '[': malformed delimiter.
318 errorline("invalid long string delimiter")
322 --------------------------------------------------------
324 if p < 5 then -- strings
-- p in 3..4 selects the '"' / "'" entries of the dispatch string.
326 addtoken("TK_STRING", read_string(r))
329 r = match(z, "^%.%.?%.?", i) -- .|..|... dots
331 --------------------------------------------------------
-- Relational/assignment ops, optionally followed by '=' (==, <=, ~=, ...).
333 r = match(z, "^%p=?", i)
338 addtoken("TK_OP", r) -- for other symbols, fall through
341 ----------------------------------------------------------------
-- Any remaining single character is its own operator token.
342 local r = sub(z, i, i)
345 addtoken("TK_OP", r) -- other single-char tokens
348 addtoken("TK_EOS", "") -- end of stream,
350 ----------------------------------------------------------------
-- Export via the Lua 5.0/5.1 module-environment idiom: returns the
-- function environment so init/llex/tok/seminfo/tokln are visible.
-- NOTE(review): a 'setfenv'-style module preamble is presumably in the
-- elided top of the file -- this pattern is deprecated after 5.1.
355 return base.getfenv()