1 module gamma.input.epsilang.Scanner;
2 
3 import gamma.util.Position;
4 import log;
5 import std.ascii;
6 import std.conv;
7 import std.range;
8 import std.stdio;
9 
/**
 * Scanner for the grammar input: reads the complete input up front and
 * delivers one token per call of `read`, each token coded as a single character.
 *
 * The source is handled as a sequence of bytes (`char`) that is terminated
 * by a sentinel `END` character, so that looking one character ahead never
 * leaves the array.
 */
class Scanner
{
    /**
     * Position in the source of the enclosing scanner. Errors are reported by
     * logging the line number and the affected line with a `^` mark below
     * the position.
     */
    private class MarkedLinePrinting : Position
    {
        private size_t pos;

        this(size_t pos)
        {
            this.pos = pos;
        }

        /**
         * Logs the message together with the line number, the quoted line and
         * a mark line, and increments the error count of the enclosing scanner.
         *
         * Params:
         *     message = the text to be logged for this position
         */
        public void markError(string message)
        {
            // The number of recorded line beginnings at or before pos is the 1-based line number.
            const lineNumber = this.outer.lineBeginPos.assumeSorted.lowerBound(this.pos + 1).length;
            const beginPos = this.outer.lineBeginPos[lineNumber - 1];
            const endPos = this.outer.findLineSeparator(this.pos);

            // lineBeginPos starts with 0 and only entries <= pos have been selected above.
            assert(beginPos <= this.pos);

            char[] quotation = this.outer.source[beginPos .. endPos];
            char[] mark = new char[this.pos - beginPos + 1];

            // Tabs are repeated in the mark line so that the '^' lines up with the quoted line.
            foreach (i, ref m; mark[0 .. $ - 1])
                m = (this.outer.source[beginPos + i] == '\t') ? '\t' : ' ';
            mark[$ - 1] = '^';
            error!"%s: %s\n%s\n%s"(lineNumber, message, quotation, mark);
            ++this.outer.errorCount;
        }

        /**
         * Two positions are equal if they denote the same offset of the same scanner.
         */
        public override bool opEquals(Object o)
        {
            auto that = cast(MarkedLinePrinting) o;

            if (that is null)
                return false;
            return this.pos == that.pos && this.scanner == that.scanner;
        }

        public override size_t toHash() nothrow @safe
        {
            return this.pos;
        }

        private Scanner scanner()
        {
            return this.outer;
        }
    }

    /// Token code for the end of the input; also the sentinel appended to the source.
    static const char END = 0;

    /// Token code for a string literal; the text is available from `getRepresentation`.
    static const char LITERAL = '"';

    /// Token code for a number; the value is available from `getValue`.
    static const char NUMBER = '0';

    /// Token code for a name starting with a lower-case letter.
    static const char LEXICAL_VARIABLE = 'a';

    /// Token code for a name starting with an upper-case letter.
    static const char SYNTACTIC_VARIABLE = 'A';

    // complete input, terminated by END
    private char[] source;

    // offset of the next character to be scanned
    private size_t pos = 0;

    // position of the most recently read token
    private Position position;

    private string representation;

    private int value;

    // offsets of the line beginnings seen so far, in ascending order
    private size_t[] lineBeginPos;

    private int errorCount = 0;

    /**
     * Reads the complete input and prepares scanning from its beginning.
     *
     * Params:
     *     input = the file to be scanned
     */
    this(File input)
    {
        import std.algorithm : joiner;
        import std.array : array;

        this.source = cast(char[]) input.byChunk(4096).joiner.array;
        this.source ~= END;
        this.lineBeginPos ~= this.pos;
    }

    /**
     * Skips white space and comments and reads the next token.
     *
     * If the trace level of the log is enabled, the token is additionally
     * logged at its position (without affecting the error count).
     *
     * Returns: the recognized token, coded as a single character
     */
    char read()
    {
        import std.format : format;

        char readToken()
        {
            char c;

            for (;;)
            {
                c = this.source[this.pos];
                if (c == '\n' || c == '\r')
                {
                    skipLine;
                }
                else if (isWhite(c))
                {
                    ++this.pos;
                }
                else if (c == '/')
                {
                    const next = this.source[this.pos + 1];

                    if (next == '/')
                        skipLine;
                    else if (next == '*')
                        readComment;
                    else
                        break;
                }
                else if (c == '*')
                {
                    // skip unsupported lexical-variable marker
                    ++this.pos;
                }
                else
                {
                    break;
                }
            }
            this.position = new MarkedLinePrinting(this.pos);
            this.representation = null;
            if (c == '"')
            {
                this.representation = readLiteral;
                return LITERAL;
            }
            if (isDigit(c))
            {
                this.value = readNumber;
                return NUMBER;
            }
            if (isLower(c))
            {
                this.representation = readVariable;
                return LEXICAL_VARIABLE;
            }
            if (isUpper(c))
            {
                this.representation = readVariable;
                return SYNTACTIC_VARIABLE;
            }
            if (c != END)
            {
                // any other character is a token of its own
                ++this.pos;
                return c;
            }
            // the position is not advanced: every further call delivers END again
            return END;
        }

        const c = readToken;

        if (levels & Level.trace)
        {
            switch (c)
            {
                case END:
                    getPosition.markError("END");
                    break;
                case LITERAL:
                    getPosition.markError(format!"LITERAL: %s"(getRepresentation));
                    break;
                case NUMBER:
                    getPosition.markError(format!"NUMBER: %s"(getValue));
                    break;
                case LEXICAL_VARIABLE:
                    getPosition.markError(format!"LEXICAL VARIABLE: %s"(getRepresentation));
                    break;
                case SYNTACTIC_VARIABLE:
                    getPosition.markError(format!"SYNTACTIC VARIABLE: %s"(getRepresentation));
                    break;
                default:
                    getPosition.markError(format!"CHAR: %s"(c));
                    break;
            }
            // markError has counted the trace output as an error: undo that
            --this.errorCount;
        }
        return c;
    }

    /**
     * Skips a block comment, which may contain nested block comments.
     * Line beginnings inside of the comment are recorded.
     * A comment that is not closed before the end of the input is reported
     * as an error at the beginning of the comment.
     */
    private void readComment()
    in (this.source[this.pos] == '/' && this.source[this.pos + 1] == '*')
    {
        const start = this.pos;
        int level = 1;
        // c1 and c2 are two consecutive characters; c2 is the one at this.pos
        char c1 = ' ';
        char c2;

        ++this.pos;
        for (;;)
        {
            c2 = this.source[++this.pos];
            if (c1 == '/' && c2 == '*')
            {
                ++this.pos;
                ++level;
                c1 = this.source[this.pos];
            }
            else if (c1 == '*' && c2 == '/')
            {
                ++this.pos;
                if (--level == 0)
                    return;
                c1 = this.source[this.pos];
            }
            else
                c1 = c2;
            while (c1 == '\n' || c1 == '\r')
            {
                skipLine;
                c1 = this.source[this.pos];
            }
            if (c1 == END)
            {
                Position position = new MarkedLinePrinting(start);

                position.markError("comment not terminated at end of input");
                return;
            }
        }
    }

    /**
     * Reads the escape sequence at the current position and appends the
     * denoted character to the buffer.
     *
     * Supported are the single-character escapes `\b \t \n \f \r \" \' \\`,
     * octal escapes up to the value 0xff (appended as a single byte), and
     * `\uXXXX` escapes with exactly four hexadecimal digits (appended in
     * UTF-8 encoding, so that code points above 0x7f are no longer truncated
     * to a single byte).
     *
     * In case of an error, the error is reported and a NUL character is
     * appended instead. A backslash in front of a line separator or the end
     * of the input appends a NUL character without a report; the caller
     * reports the unterminated string.
     *
     * Params:
     *     buffer = the text of the literal read so far
     */
    private void readEscapeSequence(ref char[] buffer)
    in (this.source[this.pos] == '\\')
    {
        import std.utf : encode, isValidDchar;

        const start = this.pos;
        char c = this.source[++this.pos];
        int value = 0;

        switch (c)
        {
        case 'b':
            c = '\b';
            break;
        case 't':
            c = '\t';
            break;
        case 'n':
            c = '\n';
            break;
        case 'f':
            c = '\f';
            break;
        case 'r':
            c = '\r';
            break;
        case '"':
            c = '"';
            break;
        case '\'':
            c = '\'';
            break;
        case '\\':
            c = '\\';
            break;
        case '0': .. case '7':
            while (isOctalDigit(this.source[this.pos]))
                ++this.pos;
            try
            {
                value = this.source[start + 1 .. this.pos].to!int(8);
            }
            catch (ConvException)
            {
                // only octal digits have been collected: the conversion can only overflow
                value = int.max;
            }
            if (value > 0xff)
            {
                Position position = new MarkedLinePrinting(start);

                position.markError("octal character constant out of range");
                buffer ~= '\0';
                return;
            }
            buffer ~= cast(char) value;
            return;
        case 'u':
            foreach (i; 0 .. 4)
            {
                if (!isHexDigit(this.source[++this.pos]))
                {
                    Position position = new MarkedLinePrinting(this.pos);

                    position.markError("hexadecimal digit expected");
                    buffer ~= '\0';
                    return;
                }
            }
            // exactly four checked hexadecimal digits: the conversion cannot fail
            value = this.source[start + 2 .. this.pos + 1].to!int(16);
            ++this.pos;
            if (!isValidDchar(cast(dchar) value))
            {
                // e.g. a surrogate, which cannot be encoded in UTF-8
                Position position = new MarkedLinePrinting(start);

                position.markError("illegal unicode character");
                buffer ~= '\0';
                return;
            }
            encode(buffer, cast(dchar) value);
            return;
        default:
            if (c != '\n' && c != '\r' && c != END)
            {
                Position position = new MarkedLinePrinting(this.pos);

                position.markError("illegal escape character");
            }
            // the offending character is not consumed
            buffer ~= '\0';
            return;
        }
        ++this.pos;
        buffer ~= c;
    }

    /**
     * Reads the string literal at the current position.
     *
     * A literal has to be closed on the same line; otherwise an error is
     * reported and the empty string is returned. An empty literal is reported
     * as an error, too.
     *
     * Returns: the text between the quotes with all escape sequences replaced
     */
    private string readLiteral()
    in (this.source[this.pos] == '"')
    {
        string representation;
        const start = this.pos + 1;
        char c;

        do
        {
            c = this.source[++this.pos];
        }
        while (c != '\\' && c != '"' && c != '\n' && c != '\r' && c != END);
        if (c == '\\')
        {
            // slow path: the text has to be assembled piece by piece
            char[] buffer;

            buffer ~= this.source[start .. this.pos];
            do
            {
                if (c == '\\')
                {
                    readEscapeSequence(buffer);
                    c = this.source[this.pos];
                }
                else
                {
                    buffer ~= c;
                    c = this.source[++this.pos];
                }
            }
            while (c != '"' && c != '\n' && c != '\r' && c != END);
            representation = buffer.idup;
        }
        else
        {
            if (c == '"' && this.pos == start)
                this.position.markError("illegal empty string");
            representation = this.source[start .. this.pos].idup;
        }
        if (c != '"')
        {
            this.position.markError("string not terminated at end of line");
            representation = "";
        }
        else
            ++this.pos;
        return representation;
    }

    /**
     * Reads the decimal number at the current position.
     *
     * Returns: the value of the number, or 0 if the number exceeds 9999
     *     (which is reported as an error at the position of the token)
     */
    private int readNumber()
    in (isDigit(this.source[this.pos]))
    {
        const start = this.pos;
        int value = 0;

        while (isDigit(this.source[this.pos]))
            ++this.pos;
        try
        {
            value = this.source[start .. this.pos].to!int;
        }
        catch (ConvException)
        {
            // only digits have been collected: the conversion can only overflow
            value = int.max;
        }
        if (value > 9999)
        {
            position.markError("number out of range [0, 9999]");
            value = 0;
        }
        return value;
    }

    /**
     * Reads the name at the current position.
     *
     * Returns: the longest sequence of ASCII letters at the current position
     */
    private string readVariable()
    in (isAlpha(this.source[this.pos]))
    {
        const start = this.pos;

        while (isAlpha(this.source[this.pos]))
            ++this.pos;
        return this.source[start .. this.pos].idup;
    }

    /**
     * Skips the rest of the current line including its line separator
     * ("\n", "\r" or "\r\n") and records the beginning of the next line.
     * At the end of the input, the position stays at the END character.
     */
    private void skipLine()
    {
        this.pos = findLineSeparator(this.pos);
        if (this.source[this.pos] != END)
        {
            if (this.source[this.pos] == '\r' && this.source[this.pos + 1] == '\n')
                ++this.pos;
            ++this.pos;
            this.lineBeginPos ~= this.pos;
        }
    }

    /**
     * Params:
     *     pos = the offset where the search starts
     *
     * Returns: the offset of the first '\n', '\r' or END character at or after pos
     */
    private size_t findLineSeparator(size_t pos)
    {
        char c = this.source[pos];

        while (c != '\n' && c != '\r' && c != END)
            c = this.source[++pos];
        return pos;
    }

    /// Returns: the position of the most recently read token
    Position getPosition()
    {
        return this.position;
    }

    /// Returns: the text of the most recently read literal or variable; null for other tokens
    string getRepresentation() const
    {
        return this.representation;
    }

    /// Returns: the value of the most recently read number
    int getValue() const
    {
        return this.value;
    }

    /// Returns: the number of errors reported so far
    int getErrorCount() const
    {
        return this.errorCount;
    }
}