1 module epsilon.lexer;
2 
3 import io : Input, Position;
4 import std.uni;
5 import symbols : SymbolTable;
6 
/**
 * Lexer that turns the characters of an `Input` into tokens and exposes them
 * through the range primitives `empty`, `front` and `popFront`.
 *
 * A token is either one of the `Token` members (end, string, name, number)
 * or, for every other character, the character itself.
 * The text of string, name and number tokens is interned in the symbol table;
 * the value returned by `intern` is available through `value`.
 */
struct Lexer
{
    /// Character source the tokens are read from.
    private Input input;

    /// Table in which the text of strings, names and numbers is interned.
    private SymbolTable symbolTable;

    invariant (symbolTable !is null);

    /// Current token: a `Token` member or a literal special character.
    private dchar token;

    /// Result of `symbolTable.intern` for the latest string, name or number.
    private size_t value_;

    /// Input position recorded just before the current token was read.
    private Position position_;

    /// Number of errors reported through `addError`.
    private size_t errorCount = 0;

    /**
     * Creates a lexer and immediately reads the first token.
     *
     * Params:
     *     input = character source to read from
     *     symbolTable = table used for interning token text; must not be null
     */
    this(Input input, SymbolTable symbolTable)
    in (symbolTable !is null)
    {
        this.input = input;
        this.symbolTable = symbolTable;
        readToken;
    }

    /// Returns: `true` when the current token is `Token.end`.
    bool empty() const @nogc nothrow pure @safe
    {
        return token == Token.end;
    }

    /// Returns: the current token.
    dchar front() const @nogc nothrow pure @safe
    {
        return token;
    }

    /// Advances to the next token.
    void popFront()
    {
        readToken;
    }

    /**
     * Skips white space and comments, then reads one token into `token`
     * (and into `value_` for strings, names and numbers).
     * The token is traced on every exit path.
     */
    private void readToken()
    {
        scope (exit)
            addTrace;
        while (true)
        {
            while (input.next.isWhite)
                input.popFront;
            position_ = input.position;
            if (input.empty)
            {
                token = Token.end;
                return;
            }
            if (input.front == '/')
            {
                // tentatively a plain '/' token; overwritten below
                // when the slash turns out to start a comment
                token = input.front;
                input.popFront;
                if (input.next == '/')
                    readLineComment;
                else if (input.next == '*')
                    readBlockComment;
                else
                    return;
            }
            else
            {
                break;
            }

        }
        if (input.front == '"' || input.front == '\'')
        {
            token = Token.string_;
            readString;
        }
        else if (input.front == '`')
        {
            token = Token.string_;
            readRawString;
        }
        else if (input.front.isAlpha || input.front == '_')
        {
            token = Token.name;
            readName;
        }
        else if (input.front.isNumber)
        {
            token = Token.number;
            readNumber;
        }
        else
        {
            // any other character is its own token
            token = input.front;
            input.popFront;
        }
    }

    @("read empty input")
    unittest
    {
        with (fixture(null))
        {
            assert(lexer.empty);
            assert(lexer.front == Token.end);
            assert(lexer.ok);
        }
    }

    @("read special character")
    unittest
    {
        with (fixture(":"))
        {
            assert(lexer.front == ':');
            assert(lexer.ok);
        }
    }

    @("read variable")
    unittest
    {
        with (fixture("foo 42"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "foo");

            lexer.popFront;

            assert(lexer.front == Token.number);
            assert(symbolTable.symbol(lexer.value) == "42");
            assert(lexer.ok);
        }
    }

    /**
     * Skips a line comment.
     * On entry the first '/' is already consumed and the input is at the
     * second one; on exit the input is at the terminating newline or empty.
     */
    private void readLineComment()
    in (input.next == '/')
    {
        do
            input.popFront;
        while (!input.empty && input.front != '\n');
    }

    @("read line comment")
    unittest
    {
        with (fixture("//\n"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    /**
     * Skips a block comment; block comments may be nested.
     * On entry the opening '/' is already consumed and the input is at the '*'.
     * Reports an error when the input ends before the comment is closed.
     */
    private void readBlockComment()
    in (input.next == '*')
    {
        size_t level = 1;
        // starts as 0 so that the '*' of the opening delimiter
        // cannot double as the '*' of a closing delimiter ("/*/")
        dchar prev = 0;

        while (true)
        {
            input.popFront;
            if (prev == '/' && input.next == '*')
            {
                input.popFront;
                ++level;
            }
            else if (prev == '*' && input.next == '/')
            {
                input.popFront;
                --level;
                if (level == 0)
                    break;
            }
            if (input.empty)
            {
                addError("comment not closed at end of input");
                break;
            }
            prev = input.front;
        }
    }

    @("read block comment")
    unittest
    {
        with (fixture("/**/"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read nested block comment")
    unittest
    {
        with (fixture("/*/**/*/"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read block comment with line comment")
    unittest
    {
        with (fixture("/* // */"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read invalid block comment")
    unittest
    {
        with (fixture("/*/*/"))
        {
            assert(lexer.empty);
            assert(!lexer.ok);
        }
    }

    /**
     * Reads a quoted string, including its quotes, and interns its text.
     *
     * string:
     *     "'" { character | '\' character } "'"
     *   | '"' { character | '\' character } '"'.
     *
     * Reports an error when the line or the input ends before the closing
     * quote; the text read so far is interned in that case as well.
     */
    private void readString()
    in (input.next == '"' || input.next == '\'')
    {
        const quote = input.next;
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
        {
            input.popFront;
            // `while` rather than `if`: the character following an escape
            // sequence may start another escape sequence (e.g. `\\\'`);
            // it must not be skipped as an ordinary character
            while (input.next == '\\')
            {
                input.popFront;
                // the escaped character is consumed unless it is a newline,
                // which is left in place for the error check below
                if (!input.empty && input.front != '\n')
                    input.popFront;
            }
            if (input.empty || input.front == '\n')
            {
                addError("string not closed at end of line");
                return;
            }
        }
        while (input.front != quote);
        input.popFront;
    }

    @("read single-quoted string")
    unittest
    {
        with (fixture("'foo'"))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == "'foo'");
            assert(lexer.ok);
        }
    }

    @("read double-quoted string")
    unittest
    {
        with (fixture(`"foo"`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `"foo"`);
            assert(lexer.ok);
        }
    }

    @("read empty string")
    unittest
    {
        with (fixture(`''`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `''`);
            assert(lexer.ok);
        }
    }

    @("read string with escape sequence")
    unittest
    {
        with (fixture(`'\''`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `'\''`);
            assert(lexer.ok);
        }
    }

    @("read string with consecutive escape sequences")
    unittest
    {
        with (fixture(`'\\\''`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `'\\\''`);

            lexer.popFront;

            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read invalid string")
    unittest
    {
        with (fixture(`'foo`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `'foo`);
            assert(!lexer.ok);
        }
    }

    /**
     * Reads a raw string, including its back-quotes, and interns its text.
     * A backslash has no special meaning inside a raw string.
     *
     * string: "`" { character } "`".
     *
     * Reports an error when the line or the input ends before the closing
     * back-quote.
     */
    private void readRawString()
    in (input.next == '`')
    {
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
        {
            input.popFront;
            if (input.empty || input.front == '\n')
            {
                addError("string not closed at end of line");
                return;
            }
        }
        while (input.front != '`');
        input.popFront;
    }

    @("read raw string")
    unittest
    {
        with (fixture("`\\`"))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == "`\\`");
            assert(lexer.ok);
        }
    }

    /**
     * Reads a name and interns its text.
     *
     * name: ( letter | "_") { letter | "_" }.
     */
    private void readName()
    in (input.next.isAlpha || input.next == '_')
    {
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
            input.popFront;
        while (input.next.isAlpha || input.next == '_');
    }

    @("read name")
    unittest
    {
        with (fixture("foo"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "foo");
            assert(lexer.ok);
        }
    }

    @("read reserved name")
    unittest
    {
        with (fixture("_foo"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "_foo");
            assert(lexer.ok);
        }
    }

    @("read name with umlauts")
    unittest
    {
        with (fixture("äöü"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "äöü");
            assert(lexer.ok);
        }
    }

    /**
     * Reads a number and interns its text.
     *
     * number: digit { digit }.
     */
    private void readNumber()
    in (input.next.isNumber)
    {
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
            input.popFront;
        while (input.next.isNumber);
    }

    @("read number")
    unittest
    {
        with (fixture("42"))
        {
            assert(lexer.front == Token.number);
            assert(symbolTable.symbol(lexer.value) == "42");
            assert(lexer.ok);
        }
    }

    /// Writes a trace message describing the current token and its position.
    private void addTrace()
    {
        import log : trace;

        switch (token) with (Token)
        {
            case end:
                trace!"end\n%s"(position_);
                break;
            case string_:
                trace!"string: %s\n%s"(symbolTable.symbol(value_), position_);
                break;
            case name:
                trace!"name: %s\n%s"(symbolTable.symbol(value_), position_);
                break;
            case number:
                trace!"number: %s\n%s"(symbolTable.symbol(value_), position_);
                break;
            default:
                trace!"%s\n%s"(token, position_);
                break;
        }
    }

    /**
     * Counts an error and logs the message together with the position
     * of the current token.
     */
    private void addError(string message)
    {
        import log : error;

        ++errorCount;
        error!"%s\n%s"(message, position_);
    }

    /// Returns: `true` as long as no error has been reported.
    bool ok() const @nogc nothrow pure @safe
    {
        return errorCount == 0;
    }

    /**
     * Returns: the value `symbolTable.intern` returned for the most recently
     * read string, name or number token.
     */
    size_t value() const @nogc nothrow pure @safe
    {
        return value_;
    }

    /// Returns: the input position recorded before the current token was read.
    Position position() const @nogc nothrow pure @safe
    {
        return position_;
    }
}
471 
/**
 * Look-ahead that is safe at the end of the input.
 *
 * Returns: the front character of `input`, or 0 when `input` is empty.
 */
private dchar next(ref Input input)
{
    if (input.empty)
        return 0;
    return input.front;
}
476 
/**
 * Token classes produced by the lexer.
 * Each class is represented by a character value; the lexer reports
 * characters outside these classes as themselves.
 */
enum Token : dchar
{
    /// end of input
    end = '\0',
    /// quoted or raw string
    string_ = '"',
    /// name
    name = 'A',
    /// number
    number = '0',
}
484 
version (unittest)
{
    /**
     * Test helper: builds a lexer over `text` (using "name" as the name of
     * the input) together with the fresh symbol table the lexer interns into.
     */
    private auto fixture(string text)
    {
        struct Fixture
        {
            Lexer lexer;

            SymbolTable symbolTable;
        }

        auto table = new SymbolTable;

        return Fixture(
            Lexer(Input("name", text), table),
            table,
        );
    }
}