module epsilon.lexer;

import io : Input, Position;
import std.uni;
import symbols : SymbolTable;

/**
 * Lexer that splits an `Input` into tokens.
 *
 * The current token is exposed through `empty`, `front` and `popFront`.
 * White space, line comments (`// ...`) and nestable block comments
 * (`/* ... */`) are skipped. For string, name and number tokens the token
 * text is interned in the symbol table and the resulting id is available
 * through `value`. Any other character is returned as a token of its own.
 */
struct Lexer
{
    private Input input;

    private SymbolTable symbolTable;

    invariant (symbolTable !is null);

    /// Kind of the current token: a `Token` member or the character itself.
    private dchar token;

    /// Id returned by `symbolTable.intern` for the current string, name or number token.
    private size_t value_;

    /// Input position recorded right before the current token was read.
    private Position position_;

    /// Number of errors reported through `addError`.
    private size_t errorCount = 0;

    /**
     * Creates a lexer and immediately reads the first token.
     *
     * Params:
     *     input = the input to be read
     *     symbolTable = the table used for interning token texts; must not be null
     */
    this(Input input, SymbolTable symbolTable)
    in (symbolTable !is null)
    {
        this.input = input;
        this.symbolTable = symbolTable;
        readToken;
    }

    /// Returns: `true` if the current token is `Token.end`.
    bool empty() const @nogc nothrow pure @safe
    {
        return token == Token.end;
    }

    /// Returns: the kind of the current token.
    dchar front() const @nogc nothrow pure @safe
    {
        return token;
    }

    /// Advances to the next token.
    void popFront()
    {
        readToken;
    }

    /**
     * Reads the next token into `token` (and `value_`, where applicable).
     *
     * White space and comments are skipped first. A '/' that starts neither
     * a line comment nor a block comment is returned as a token of its own.
     * The token that was read is traced on every exit path.
     */
    private void readToken()
    {
        scope (exit)
            addTrace;
        while (true)
        {
            while (input.next.isWhite)
                input.popFront;
            position_ = input.position;
            if (input.empty)
            {
                token = Token.end;
                return;
            }
            if (input.front == '/')
            {
                // tentatively a '/' token; overwritten once the comment has been skipped
                token = input.front;
                input.popFront;
                if (input.next == '/')
                    readLineComment;
                else if (input.next == '*')
                    readBlockComment;
                else
                    return;
            }
            else
            {
                break;
            }
        }
        if (input.front == '"' || input.front == '\'')
        {
            token = Token.string_;
            readString;
        }
        else if (input.front == '`')
        {
            token = Token.string_;
            readRawString;
        }
        else if (input.front.isAlpha || input.front == '_')
        {
            token = Token.name;
            readName;
        }
        else if (input.front.isNumber)
        {
            token = Token.number;
            readNumber;
        }
        else
        {
            token = input.front;
            input.popFront;
        }
    }

    @("read empty input")
    unittest
    {
        with (fixture(null))
        {
            assert(lexer.empty);
            assert(lexer.front == Token.end);
            assert(lexer.ok);
        }
    }

    @("read special character")
    unittest
    {
        with (fixture(":"))
        {
            assert(lexer.front == ':');
            assert(lexer.ok);
        }
    }

    @("read variable")
    unittest
    {
        with (fixture("foo 42"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "foo");

            lexer.popFront;

            assert(lexer.front == Token.number);
            assert(symbolTable.symbol(lexer.value) == "42");
            assert(lexer.ok);
        }
    }

    /**
     * Skips a line comment.
     *
     * Called with the input at the second '/' of "//". Consumes characters
     * up to, but not including, the next line feed or the end of the input.
     */
    private void readLineComment()
    in (input.next == '/')
    {
        do
            input.popFront;
        while (!input.empty && input.front != '\n');
    }

    @("read line comment")
    unittest
    {
        with (fixture("//\n"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    /**
     * Skips a block comment; block comments may be nested.
     *
     * Called with the input at the '*' of the opening "/*". Reports an error
     * if the input ends before every opened comment has been closed.
     */
    private void readBlockComment()
    in (input.next == '*')
    {
        size_t level = 1;
        // previously inspected character; 0 as long as there is none
        dchar prev = 0;

        while (true)
        {
            input.popFront;
            if (prev == '/' && input.next == '*')
            {
                input.popFront;
                ++level;
            }
            else if (prev == '*' && input.next == '/')
            {
                input.popFront;
                --level;
                if (level == 0)
                    break;
            }
            if (input.empty)
            {
                addError("comment not closed at end of input");
                break;
            }
            prev = input.front;
        }
    }

    @("read block comment")
    unittest
    {
        with (fixture("/**/"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read nested block comment")
    unittest
    {
        with (fixture("/*/**/*/"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read block comment with line comment")
    unittest
    {
        with (fixture("/* // */"))
        {
            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read invalid block comment")
    unittest
    {
        with (fixture("/*/*/"))
        {
            assert(lexer.empty);
            assert(!lexer.ok);
        }
    }

    /**
     * string:
     *     "'" { character | '\' character } "'"
     *   | '"' { character | '\' character } '"'.
     *
     * Called with the input at the opening quote. The text read, including
     * the quotes, is interned and stored in `value_` on every exit path.
     * Reports an error if the line or the input ends before the closing quote.
     */
    private void readString()
    in (input.next == '"' || input.next == '\'')
    {
        const quote = input.next;
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
        {
            input.popFront;
            // Loop rather than branch: escape sequences may directly follow
            // one another, and every backslash has to be recognized here,
            // because the next iteration pops the current character unexamined.
            while (input.next == '\\')
            {
                input.popFront;
                // the escaped character is skipped, unless the line or the input ends
                if (!input.empty && input.front != '\n')
                    input.popFront;
            }
            if (input.empty || input.front == '\n')
            {
                addError("string not closed at end of line");
                return;
            }
        }
        while (input.front != quote);
        input.popFront;
    }

    @("read single-quoted string")
    unittest
    {
        with (fixture("'foo'"))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == "'foo'");
            assert(lexer.ok);
        }
    }

    @("read double-quoted string")
    unittest
    {
        with (fixture(`"foo"`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `"foo"`);
            assert(lexer.ok);
        }
    }

    @("read empty string")
    unittest
    {
        with (fixture(`''`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `''`);
            assert(lexer.ok);
        }
    }

    @("read string with escape sequence")
    unittest
    {
        with (fixture(`'\''`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `'\''`);
            assert(lexer.ok);
        }
    }

    @("read string with consecutive escape sequences")
    unittest
    {
        with (fixture(`'\'\''`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `'\'\''`);
            assert(lexer.ok);

            lexer.popFront;

            assert(lexer.empty);
            assert(lexer.ok);
        }
    }

    @("read invalid string")
    unittest
    {
        with (fixture(`'foo`))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == `'foo`);
            assert(!lexer.ok);
        }
    }

    /**
     * string: "`" { character } "`".
     *
     * Called with the input at the opening backquote. A backslash has no
     * special meaning here. The text read, including the backquotes, is
     * interned and stored in `value_` on every exit path. Reports an error
     * if the line or the input ends before the closing backquote.
     */
    private void readRawString()
    in (input.next == '`')
    {
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
        {
            input.popFront;
            if (input.empty || input.front == '\n')
            {
                addError("string not closed at end of line");
                return;
            }
        }
        while (input.front != '`');
        input.popFront;
    }

    @("read raw string")
    unittest
    {
        with (fixture("`\\`"))
        {
            assert(lexer.front == Token.string_);
            assert(symbolTable.symbol(lexer.value) == "`\\`");
            assert(lexer.ok);
        }
    }

    /**
     * name: ( letter | "_") { letter | "_" }.
     *
     * Letters are the characters accepted by `std.uni.isAlpha`.
     * The name is interned and stored in `value_`.
     */
    private void readName()
    in (input.next.isAlpha || input.next == '_')
    {
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
            input.popFront;
        while (input.next.isAlpha || input.next == '_');
    }

    @("read name")
    unittest
    {
        with (fixture("foo"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "foo");
            assert(lexer.ok);
        }
    }

    @("read reserved name")
    unittest
    {
        with (fixture("_foo"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "_foo");
            assert(lexer.ok);
        }
    }

    @("read name with umlauts")
    unittest
    {
        with (fixture("äöü"))
        {
            assert(lexer.front == Token.name);
            assert(symbolTable.symbol(lexer.value) == "äöü");
            assert(lexer.ok);
        }
    }

    /**
     * number: digit { digit }.
     *
     * Digits are the characters accepted by `std.uni.isNumber`.
     * The text of the number is interned and stored in `value_`.
     */
    private void readNumber()
    in (input.next.isNumber)
    {
        const begin = input.index;

        scope (exit)
            value_ = symbolTable.intern(input.sliceFrom(begin));
        do
            input.popFront;
        while (input.next.isNumber);
    }

    @("read number")
    unittest
    {
        with (fixture("42"))
        {
            assert(lexer.front == Token.number);
            assert(symbolTable.symbol(lexer.value) == "42");
            assert(lexer.ok);
        }
    }

    /// Traces the current token together with its position.
    private void addTrace()
    {
        import log : trace;

        switch (token) with (Token)
        {
        case end:
            trace!"end\n%s"(position_);
            break;
        case string_:
            trace!"string: %s\n%s"(symbolTable.symbol(value_), position_);
            break;
        case name:
            trace!"name: %s\n%s"(symbolTable.symbol(value_), position_);
            break;
        case number:
            trace!"number: %s\n%s"(symbolTable.symbol(value_), position_);
            break;
        default:
            trace!"%s\n%s"(token, position_);
            break;
        }
    }

    /**
     * Counts an error and logs the message together with the position
     * recorded for the current token.
     */
    private void addError(string message)
    {
        import log : error;

        ++errorCount;
        error!"%s\n%s"(message, position_);
    }

    /// Returns: `true` as long as no error has been reported.
    bool ok() const @nogc nothrow pure @safe
    {
        return errorCount == 0;
    }

    /// Returns: the symbol-table id stored for the last string, name or number token read.
    size_t value() const @nogc nothrow pure @safe
    {
        return value_;
    }

    /// Returns: the input position recorded for the current token.
    Position position() const @nogc nothrow pure @safe
    {
        return position_;
    }
}

/// Returns: the front character of the input, or 0 if the input is empty.
private dchar next(ref Input input)
{
    return input.empty ? 0 : input.front;
}

/**
 * Token kinds that stand for a whole class of tokens.
 *
 * Every other token is represented by its own character. The values of
 * `string_`, `name` and `number` are characters that the lexer always
 * reads as the start of a token of the respective class.
 */
enum Token : dchar
{
    end = 0,
    string_ = '"',
    name = 'A',
    number = '0',
}

version (unittest)
{
    /// Creates a lexer for the given text together with a fresh symbol table.
    private auto fixture(string text)
    {
        struct Fixture
        {
            Lexer lexer;

            SymbolTable symbolTable;
        }

        auto symbolTable = new SymbolTable;

        return Fixture(Lexer(Input("name", text), symbolTable), symbolTable);
    }
}