From 36ec04c00fa540fcee0f2cff1f7b81dd8a98101a Mon Sep 17 00:00:00 2001
From: Ferenc Szontágh <szf@fsociety.hu>
Date: Thu, 17 Apr 2025 18:44:58 +0000
Subject: [PATCH] some refactor
---
src/Lexer/Lexer.cpp | 307 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 299 insertions(+), 8 deletions(-)
diff --git a/src/Lexer/Lexer.cpp b/src/Lexer/Lexer.cpp
index 6f6cc35..b5a4261 100644
--- a/src/Lexer/Lexer.cpp
+++ b/src/Lexer/Lexer.cpp
@@ -1,14 +1,305 @@
#include "Lexer/Lexer.hpp"
-namespace Lexer {
-const std::vector<std::string> Lexer::Lexer::OPERATOR_RELATIONAL = { "==", "!=", "<", ">", "<=", ">=" };
-const std::vector<std::string> Lexer::Lexer::OPERATOR_INCREMENT = { "++", "--" };
-const std::vector<std::string> Lexer::Lexer::OPERATOR_ASSIGNMENT = { "=", "+=", "-=", "*=", "/=", "%=" };
-const std::vector<std::string> Lexer::Lexer::OPERATOR_LOGICAL = { "&&", "||" };
+#include "Lexer/Operators.hpp"
+#include "Symbols/SymbolContainer.hpp"
-const std::vector<std::string> Lexer::Lexer::OPERATOR_ARITHMETIC = { "+", "-", "*", "/", "%" };
-const std::vector<std::string> Lexer::Lexer::PUNCTUATION = { "(", ")", "{", "}", "[", "]", ",", ";" };
+std::vector<Lexer::Tokens::Token> Lexer::Lexer::tokenizeNamespace(const std::string & ns) {
+ if (inputs_.find(ns) == inputs_.end()) {
+ return {};
+ }
+ Symbols::SymbolContainer::instance()->enter(ns);
+ std::vector<Tokens::Token> tokens;
+ Tokens::Token token;
+ do {
+ token = nextToken();
+ tokens.push_back(token);
+ } while (token.type != Tokens::Type::END_OF_FILE);
-}; // namespace Lexer
+ tokens_[ns] = tokens;
+ return tokens;
+}
+
+void Lexer::Lexer::addNamespaceInput(const std::string & ns, const std::string & input) {
+ inputs_[ns] = input;
+ positions_[ns] = 0;
+ line_numbers_[ns] = 1;
+ column_numbers_[ns] = 1;
+}
+
+std::vector<Lexer::Tokens::Token> Lexer::Lexer::getTokens(const std::string & ns) const {
+ auto it = tokens_.find(ns);
+ if (it != tokens_.end()) {
+ return it->second;
+ }
+ return {};
+}
+
// Replace the keyword lookup table used by matchIdentifierOrKeyword() to
// promote plain identifiers to their keyword token types.
void Lexer::Lexer::setKeyWords(const std::unordered_map<std::string, Tokens::Type> & new_keywords) {
    keywords = new_keywords;
}
+
+Lexer::Tokens::Token Lexer::Lexer::nextToken() {
+ skipWhitespaceAndComments();
+ size_t start = pos();
+
+ if (isAtEnd()) {
+ return createToken(Tokens::Type::END_OF_FILE, start, start);
+ }
+
+ char c = peek();
+ if (isalpha(c) || c == '_') {
+ return matchIdentifierOrKeyword(start);
+ }
+ if (isdigit(c) || (isdigit(c) && peek(1) == '.') || (c == '.' && isdigit(peek(1)))) {
+ return matchNumber(start);
+ }
+ if (c == '"' || c == '\'') {
+ return matchStringLiteral(start);
+ }
+ if (operators_.find(c) != std::string_view::npos) {
+ return matchOperatorOrPunctuation(start);
+ }
+
+ advance();
+ return createToken(Tokens::Type::UNKNOWN, start, pos());
+}
+
+const std::string & Lexer::Lexer::input() const {
+ const auto & ns = Symbols::SymbolContainer::instance()->currentScopeName();
+ auto it = inputs_.find(ns);
+ if (it != inputs_.end()) {
+ return it->second;
+ }
+ throw Exception("Input not found in namespace: " + ns);
+}
+
+size_t & Lexer::Lexer::pos() {
+ const auto & ns = Symbols::SymbolContainer::instance()->currentScopeName();
+ auto it = positions_.find(ns);
+ if (it != positions_.end()) {
+ return it->second;
+ }
+ throw Exception("Unknown position in namespace: " + ns);
+}
+
+int & Lexer::Lexer::line() {
+ const auto & ns = Symbols::SymbolContainer::instance()->currentScopeName();
+ auto it = line_numbers_.find(ns);
+ if (it != line_numbers_.end()) {
+ return it->second;
+ }
+ throw Exception("Unknown line number in namespace: " + ns);
+}
+
+int & Lexer::Lexer::col() {
+ const auto & ns = Symbols::SymbolContainer::instance()->currentScopeName();
+ auto it = column_numbers_.find(ns);
+ if (it != column_numbers_.end()) {
+ return it->second;
+ }
+ throw Exception("Unknown column number in namespace: " + ns);
+}
+
+Lexer::Tokens::Token Lexer::Lexer::createToken(Tokens::Type type, size_t start, size_t end, const std::string & value) {
+ Tokens::Token token;
+ token.type = type;
+ token.start_pos = start;
+ token.end_pos = end;
+ token.line_number = line();
+ token.column_number = col();
+ if (start <= end && end <= input().length()) {
+ token.lexeme = std::string_view(input()).substr(start, end - start);
+ token.value = value.empty() ? std::string(token.lexeme) : value;
+ }
+ return token;
+}
+
+char Lexer::Lexer::peek(size_t offset) const {
+ const auto & ns = Symbols::SymbolContainer::instance()->currentScopeName();
+ const auto & in = inputs_.at(ns);
+ size_t cp = positions_.at(ns);
+ if (cp + offset >= in.length()) {
+ return '\0';
+ }
+ return in[cp + offset];
+}
+
+char Lexer::Lexer::advance() {
+ char c = peek();
+ pos()++;
+ if (c == '\n') {
+ line()++;
+ col() = 1;
+ } else {
+ col()++;
+ }
+ return c;
+}
+
+bool Lexer::Lexer::isAtEnd() const {
+ const auto & ns = Symbols::SymbolContainer::instance()->currentScopeName();
+ return positions_.at(ns) >= inputs_.at(ns).length();
+}
+
+void Lexer::Lexer::skipWhitespaceAndComments() {
+ while (!isAtEnd()) {
+ char c = peek();
+ if (isspace(c)) {
+ advance();
+ } else if ((c == '/' && peek(1) == '/') || c == '#') {
+ while (!isAtEnd() && peek() != '\n') {
+ advance();
+ }
+ } else {
+ break;
+ }
+ }
+}
+
+Lexer::Tokens::Token Lexer::Lexer::matchIdentifierOrKeyword(size_t start_pos, Tokens::Type type) {
+ while (!isAtEnd() && (isalnum(peek()) || peek() == '_')) {
+ advance();
+ }
+ size_t end = pos();
+ std::string value = input().substr(start_pos, end - start_pos);
+ if (value.empty()) {
+ return createToken(Tokens::Type::UNKNOWN, start_pos, end);
+ }
+
+ if (type == Tokens::Type::IDENTIFIER) {
+ auto it = keywords.find(value);
+ if (it != keywords.end()) {
+ return createToken(it->second, start_pos, end);
+ }
+ }
+ return createToken(type, start_pos, end);
+}
+
+Lexer::Tokens::Token Lexer::Lexer::matchNumber(size_t start_pos) {
+ bool has_dot = false;
+
+ while (!isAtEnd()) {
+ if (isdigit(peek())) {
+ advance();
+ } else if (!has_dot && peek() == '.' && isdigit(peek(1))) {
+ has_dot = true;
+ advance(); // a pont
+ advance(); // az első számjegy a pont után
+ } else {
+ break;
+ }
+ }
+
+ size_t end = pos();
+ return createToken(Tokens::Type::NUMBER, start_pos, end);
+}
+
+Lexer::Tokens::Token Lexer::Lexer::matchStringLiteral(size_t start_pos) {
+ char opening_quote = peek();
+ advance(); // Skip opening quote
+ std::string value;
+ bool unterminated = false;
+
+ while (!isAtEnd()) {
+ char c = peek();
+ if (c == opening_quote) {
+ advance();
+ break;
+ }
+ if (c == '\\') {
+ advance();
+ char e = advance();
+ switch (e) {
+ case 'n':
+ value += '\n';
+ break;
+ case 't':
+ value += '\t';
+ break;
+ case '"':
+ value += opening_quote;
+ break;
+ case '\\':
+ value += '\\';
+ break;
+ default:
+ value += e;
+ break;
+ }
+ } else {
+ value += advance();
+ }
+ }
+
+ size_t end = pos();
+ if (unterminated) {
+ return createToken(Tokens::Type::UNKNOWN, start_pos, end, input().substr(start_pos, end - start_pos));
+ }
+ return createToken(Tokens::Type::STRING_LITERAL, start_pos, end, value);
+}
+
+Lexer::Tokens::Token Lexer::Lexer::matchOperatorOrPunctuation(size_t start_pos) {
+ char first_char = advance(); // Első karakter elfogyasztása
+
+ if (!isAtEnd()) {
+ char second_char = peek(0); // Következő karakter megnézése
+ std::string two_chars_str{ first_char, second_char };
+
+ const std::vector<std::pair<const std::vector<std::string> *, Tokens::Type>> two_char_op_types = {
+ { &OPERATOR_RELATIONAL, Tokens::Type::OPERATOR_RELATIONAL },
+ { &OPERATOR_INCREMENT, Tokens::Type::OPERATOR_INCREMENT },
+ { &OPERATOR_ASSIGNMENT, Tokens::Type::OPERATOR_ASSIGNMENT },
+ { &OPERATOR_LOGICAL, Tokens::Type::OPERATOR_LOGICAL }
+ };
+
+ for (const auto & [vec_ptr, type] : two_char_op_types) {
+ if (matchFromVector(*vec_ptr, two_chars_str)) {
+ advance(); // Második karakter elfogyasztása
+ size_t end_pos = pos();
+ return createToken(type, start_pos, end_pos);
+ }
+ }
+ }
+
+ std::string single_char_str(1, first_char);
+
+ if (single_char_str == "$") {
+ if (isalpha(peek(0)) || peek(0) == '_') {
+ return matchIdentifierOrKeyword(start_pos, Tokens::Type::VARIABLE_IDENTIFIER);
+ }
+ }
+
+ const std::vector<std::pair<const std::vector<std::string> *, Tokens::Type>> one_char_op_types = {
+ { &OPERATOR_ARITHMETIC, Tokens::Type::OPERATOR_ARITHMETIC },
+ { &OPERATOR_ASSIGNMENT, Tokens::Type::OPERATOR_ASSIGNMENT },
+ { &PUNCTUATION, Tokens::Type::PUNCTUATION }
+ };
+
+ for (const auto & [vec_ptr, type] : one_char_op_types) {
+ if (matchFromVector(*vec_ptr, single_char_str)) {
+ size_t end_pos = pos();
+ return createToken(type, start_pos, end_pos);
+ }
+ }
+
+ size_t end_pos = pos();
+ return createToken(Tokens::Type::UNKNOWN, start_pos, end_pos);
+}
+
+bool Lexer::Lexer::matchFromVector(const std::vector<std::string> & vec, const std::string & value) {
+ return std::find(vec.begin(), vec.end(), value) != vec.end();
+}
+
+Lexer::Lexer::Lexer() {
+ for (const auto & vecRef :
+ { std::cref(OPERATOR_ARITHMETIC), std::cref(OPERATOR_RELATIONAL), std::cref(OPERATOR_INCREMENT),
+ std::cref(OPERATOR_ASSIGNMENT), std::cref(OPERATOR_LOGICAL), std::cref(PUNCTUATION) }) {
+ for (const auto & str : vecRef.get()) {
+ operators_ += str;
+ }
+ }
+
+ operators_ += "$";
+}
--
Gitblit v1.9.3