From 9490864ef3fd88423cef0487a4581c0e4221e0a6 Mon Sep 17 00:00:00 2001 From: Alexander Luzgarev Date: Thu, 18 Oct 2018 18:48:08 +0200 Subject: [PATCH] More diagnostics for the lexer --- ConsoleDemo/Program.cs | 8 +---- Parser/Internal/DiagnosticsBag.cs | 15 +++++++++ Parser/Internal/MLexerGreen.cs | 52 +++++++++++++++++++++++-------- Parser/Internal/SyntaxFacts.cs | 10 ++++++ Parser/TokenKind.cs | 13 ++++---- 5 files changed, 72 insertions(+), 26 deletions(-) diff --git a/ConsoleDemo/Program.cs b/ConsoleDemo/Program.cs index 18982a0..b46404d 100644 --- a/ConsoleDemo/Program.cs +++ b/ConsoleDemo/Program.cs @@ -76,13 +76,7 @@ namespace ConsoleDemo private static void ParserDemo() { Console.WriteLine("Hello World!"); - var text = @" - function [a, b c] = functionName(d, e, f) - a = d + e; - end -%{ -comment - "; + var text = @"x = 'abc"; var window = new TextWindowWithNull(text, "noname"); var parser = CreateParser(window); var tree = parser.Parse(); diff --git a/Parser/Internal/DiagnosticsBag.cs b/Parser/Internal/DiagnosticsBag.cs index 1e2bdc6..c54cf40 100644 --- a/Parser/Internal/DiagnosticsBag.cs +++ b/Parser/Internal/DiagnosticsBag.cs @@ -25,6 +25,21 @@ namespace Parser.Internal Report(span, "Unexpected end of file."); } + internal void ReportUnexpectedCharacterWhileParsingNumber(TextSpan span, char c) + { + Report(span, $"Unexpected character '{c}' while parsing a number."); + } + + internal void ReportUnexpectedEOLWhileParsingString(TextSpan span) + { + Report(span, "Unexpected end of line while parsing a string literal."); + } + + internal void ReportUnknownSymbol(TextSpan span, char c) + { + Report(span, $"Unknown symbol '{c}'."); + } + public IEnumerator GetEnumerator() { return _diagnostics.GetEnumerator(); diff --git a/Parser/Internal/MLexerGreen.cs b/Parser/Internal/MLexerGreen.cs index eabd4b1..2156128 100644 --- a/Parser/Internal/MLexerGreen.cs +++ b/Parser/Internal/MLexerGreen.cs @@ -1,4 +1,5 @@ -using System.Collections.Generic; +using System; +using System.Collections.Generic; using System.Linq; using System.Text; @@ -230,7 +231,7 @@ namespace Parser.Internal } else { - fail = true; + throw new Exception($"Unexpected symbol '{c}' at the beginning of number literal."); } break; case NumberParsingState.DigitsBeforeDot: @@ -328,7 +329,10 @@ namespace Parser.Internal if (fail) { - throw new ParsingException("Error while parsing number."); + var s = Window.GetAndConsumeChars(n); + tokenInfo.Kind = TokenKind.NumberLiteral; + tokenInfo.Text = s; + return false; } if (success) @@ -368,6 +372,7 @@ namespace Parser.Internal private bool ContinueLexingGeneralStringLiteral(ref TokenInfo tokenInfo, char quote) { + var status = 0; // no errors Window.ConsumeChar(); var textBuilder = new StringBuilder(); textBuilder.Append(quote); @@ -394,9 +399,15 @@ namespace Parser.Internal break; } } - if (SyntaxFacts.IsEolOrEof(Window.PeekChar(n))) + if (SyntaxFacts.IsEof(Window.PeekChar(n))) { - throw new ParsingException("Unfinished string literal."); + status = 1; + break; + } + if (SyntaxFacts.IsEol(Window.PeekChar(n))) + { + status = 2; + break; } n++; } @@ -404,11 +415,24 @@ namespace Parser.Internal var lastPiece = Window.GetAndConsumeChars(n); textBuilder.Append(lastPiece); valueBuilder.Append(lastPiece); - Window.ConsumeChar(); - textBuilder.Append(quote); + switch (status) { + case 0: + Window.ConsumeChar(); + textBuilder.Append(quote); + break; + case 1: + Diagnostics.ReportUnexpectedEndOfFile(new TextSpan(Window.Position.Offset, 1)); + break; + case 2: + Diagnostics.ReportUnexpectedEOLWhileParsingString(new TextSpan(Window.Position.Offset, 1)); + break; + default: + throw new Exception($"Unexpected status of parsing string literal: {status}."); + } + tokenInfo.Text = textBuilder.ToString(); tokenInfo.StringValue = valueBuilder.ToString(); - return true; + return status == 0; } private bool ContinueLexingStringLiteral(ref TokenInfo tokenInfo) @@ -520,7 +544,7 @@ namespace Parser.Internal var parsedNumber = ContinueLexingNumber(ref tokenInfo); if (!parsedNumber) { - throw new ParsingException($"Unexpected character \"{Window.PeekChar()}\" while parsing a number"); + Diagnostics.ReportUnexpectedCharacterWhileParsingNumber(new TextSpan(Window.Position.Offset, 1), Window.PeekChar()); } return true; case '=': @@ -542,7 +566,7 @@ namespace Parser.Internal var possiblyNumberToken2 = ContinueLexingNumber(ref tokenInfo); if (!possiblyNumberToken2) { - throw new ParsingException($"Unexpected character \"{Window.PeekChar()}\" while parsing a number"); + Diagnostics.ReportUnexpectedCharacterWhileParsingNumber(new TextSpan(Window.Position.Offset, 1), Window.PeekChar()); } return true; @@ -732,9 +756,11 @@ namespace Parser.Internal tokenInfo.Kind = TokenKind.EndOfFile; return true; default: - throw new ParsingException( - $"Unknown symbol \"{character}\" at {Window.Position}." - ); + Diagnostics.ReportUnknownSymbol(new TextSpan(Window.Position.Offset, 1), character); + Window.ConsumeChar(); + tokenInfo.Kind = TokenKind.BadToken; + tokenInfo.Text = character.ToString(); + return true; } } diff --git a/Parser/Internal/SyntaxFacts.cs b/Parser/Internal/SyntaxFacts.cs index 0971672..39eaeff 100644 --- a/Parser/Internal/SyntaxFacts.cs +++ b/Parser/Internal/SyntaxFacts.cs @@ -94,6 +94,16 @@ namespace Parser.Internal return c == '\n' || c == '\r' || c == '\0'; } + public static bool IsEof(char c) + { + return c == '\0'; + } + + public static bool IsEol(char c) + { + return c == '\n' || c == '\r'; + } + public static bool IsWhitespace(char c) { return c == ' ' || c == '\t' || c == '\n'; diff --git a/Parser/TokenKind.cs b/Parser/TokenKind.cs index 5769140..735ca0f 100644 --- a/Parser/TokenKind.cs +++ b/Parser/TokenKind.cs @@ -6,22 +6,23 @@ // SYNTAX TOKENS None = 0, + BadToken = 1, // The lexer puts a virtual "end of file" token at the end of the parsed file. - EndOfFile = 1, + EndOfFile = 2, // Identifier: could be a reserved word, a variable name, a class name, etc. - Identifier = 2, + Identifier = 3, // Number literal: 123, 45.678, 2e-5, etc. - NumberLiteral = 3, + NumberLiteral = 4, // String literal: 'abc', '123', etc. The "usual" string literals are single-quoted and are just char arrays. - StringLiteral = 4, + StringLiteral = 5, // Double-quoted string literal: "abc", "123", etc. These are the "new" string literal that are more like strings // and less like char arrays (for example, char arrays could be columns instead of rows, or even multi-dimensional). - DoubleQuotedStringLiteral = 5, + DoubleQuotedStringLiteral = 6, // This is for supporting "command statements" like // > cd some/+folder/ // In this example, "some/folder" should be treated as a string literal (for example, "+' there should be a part // of it, and not parsed as a binary operator). - UnquotedStringLiteral = 6, + UnquotedStringLiteral = 7, // trivia