Add lexer, parser and AST debug printer

Implements a tokenizer for the basic tokens (INT, ID, ASSIGN, PLUS, MINUS, NEWLINE, PRINT), a recursive-descent parser for assignments, print statements, and binary expressions, and a tree-formatted AST visualization via ast_debug().
2026-02-16 01:36:41 +01:00
parent d14227efeb
commit e2896fac5b
3 changed files with 332 additions and 4 deletions
--- a/src/frontend/lexer.h
+++ b/src/frontend/lexer.h
@@ -0,0 +1,114 @@
+/*
+    Convierte texto en una lista de tokens
+*/
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Kinds of token a piece of source text can be labelled with. */
+typedef enum {
+  // Literals
+  TOK_INT,    // 42
+  TOK_STRING, // "hola"
+
+  // Identifiers and keywords
+  TOK_ID,    // x, foo, mi_var
+  TOK_PRINT, // print
+  TOK_IF,    // if
+  TOK_WHILE, // while
+
+  // Operators
+  TOK_ASSIGN, // =
+  TOK_PLUS,   // +
+  TOK_MINUS,  // -
+  TOK_STAR,   // *
+  TOK_SLASH,  // /
+  TOK_EQ,     // ==
+  TOK_NEQ,    // !=
+  TOK_LT,     // <
+  TOK_GT,     // >
+
+  // Delimiters
+  TOK_LPAREN,  // (
+  TOK_RPAREN,  // )
+  TOK_COLON,   // :
+  TOK_NEWLINE, // \n (significant, as in Python)
+  TOK_INDENT,  // indentation increase
+  TOK_DEDENT,  // indentation decrease
+
+  TOK_EOF // end of input
+
+} TokenType;
+
+/* One lexical token: its kind plus the text it was built from. */
+typedef struct {
+  TokenType type; // kind of token
+  char *value;    // token text; make_token stores the pointer it is given, without copying
+  int line;       // presumably the source line number of the token; verify against producers
+} Token;
+
+/*
+ * Builds a Token of kind `type` whose text is `value`.
+ *
+ * The pointer is stored as-is (no copy is made), so `value` must stay valid
+ * for as long as the token is used. `line` is set to 0 so the returned struct
+ * never carries an indeterminate member; callers that track positions can
+ * overwrite it.
+ */
+Token make_token(TokenType type, const char *value) {
+  Token t;
+  t.type = type;
+  t.value = (char *)value; // Token.value is non-const, hence the cast
+  t.line = 0;
+  return t;
+}
+
+/*
+ * Returns a newly allocated, NUL-terminated copy of the bytes src[start, end).
+ *
+ * An empty or inverted range (end <= start) yields an empty string instead of
+ * passing a negative length to malloc/memcpy. The caller owns the result and
+ * releases it with free(). Prints an error and exits if the allocation fails.
+ */
+char *substr(const char *src, int start, int end) {
+  size_t len = end > start ? (size_t)(end - start) : 0;
+  char *s = (char *)malloc(len + 1);
+  if (s == NULL) {
+    printf("ERROR: sin memoria\n");
+    exit(1);
+  }
+  if (len > 0) {
+    memcpy(s, src + start, len);
+  }
+  s[len] = '\0';
+  return s;
+}
+
+/*
+ * Appends `tok` to the token array, doubling the capacity whenever the push
+ * would leave no spare slot for the end-of-input sentinel. Returns the
+ * (possibly moved) array; prints an error and exits if memory runs out.
+ */
+static Token *lexer_push(Token *tokens, int *count, int *capacity, Token tok) {
+  if (*count + 2 > *capacity) {
+    int grown_capacity = *capacity * 2;
+    Token *grown =
+        (Token *)realloc(tokens, sizeof(Token) * (size_t)grown_capacity);
+    if (grown == NULL) {
+      printf("ERROR: sin memoria para los tokens\n");
+      exit(1);
+    }
+    tokens = grown;
+    *capacity = grown_capacity;
+  }
+  tokens[(*count)++] = tok;
+  return tokens;
+}
+
+/*
+ * Splits `source` into tokens.
+ *
+ * Recognised input: spaces (skipped), '\n', '+', '-', '=', runs of decimal
+ * digits (TOK_INT) and words made of letters, digits and '_' that do not
+ * start with a digit (TOK_PRINT for "print", TOK_ID otherwise). Any other
+ * character prints a message and exits with status 1.
+ *
+ * Returns a heap-allocated array and stores the number of real tokens in
+ * *token_count. One extra TOK_EOF token is written at index *token_count
+ * (not included in the count) so that a consumer peeking one token past the
+ * last real one reads an initialised element. Each token's `line` is the
+ * 1-based line it appears on; a NEWLINE token carries the line it ends.
+ * INT/ID/PRINT values are heap copies made by substr(); the other values are
+ * string literals.
+ */
+Token *tokenize(const char *source, int *token_count) {
+  int capacity = 1024; // initial size; lexer_push grows it on demand
+  Token *tokens = (Token *)malloc(sizeof(Token) * (size_t)capacity);
+  if (tokens == NULL) {
+    printf("ERROR: sin memoria para los tokens\n");
+    exit(1);
+  }
+  int count = 0;
+  int pos = 0;
+  int line = 1;
+
+  // The NUL terminator alone bounds the scan; no strlen() per iteration.
+  while (source[pos] != '\0') {
+    char c = source[pos];
+    Token tok;
+
+    if (c == ' ') {
+      pos++;
+      continue;
+    } else if (c == '\n') {
+      tok = make_token(TOK_NEWLINE, "\n");
+      pos++;
+    } else if (c == '+') {
+      tok = make_token(TOK_PLUS, "+");
+      pos++;
+    } else if (c == '-') {
+      tok = make_token(TOK_MINUS, "-");
+      pos++;
+    } else if (c == '=') {
+      tok = make_token(TOK_ASSIGN, "=");
+      pos++;
+    } else if (c >= '0' && c <= '9') {
+      // Read every consecutive digit
+      int start = pos;
+      while (source[pos] >= '0' && source[pos] <= '9')
+        pos++;
+      tok = make_token(TOK_INT, substr(source, start, pos));
+    } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
+      // Read the whole word; the cast keeps isalnum() defined for bytes
+      // that are negative when char is signed.
+      int start = pos;
+      while (isalnum((unsigned char)source[pos]) || source[pos] == '_')
+        pos++;
+      char *word = substr(source, start, pos);
+
+      // Check whether the word is a reserved keyword
+      if (strcmp(word, "print") == 0) {
+        tok = make_token(TOK_PRINT, word);
+      } else {
+        tok = make_token(TOK_ID, word);
+      }
+    } else {
+      printf("WARN: caracter no reconocido '%c' en pos %d\n", c, pos);
+      exit(1);
+    }
+
+    tok.line = line;
+    tokens = lexer_push(tokens, &count, &capacity, tok);
+    if (c == '\n')
+      line++;
+  }
+
+  // Sentinel: lexer_push always leaves this slot free.
+  tokens[count] = make_token(TOK_EOF, "");
+  tokens[count].line = line;
+
+  *token_count = count;
+  return tokens;
+}
--- a/src/frontend/parser.h
+++ b/src/frontend/parser.h
@@ -0,0 +1,208 @@
+/*
+    Convierte tokens en un arbol
+*/
+
+#include <errno.h>
+#include <limits.h>
+
+#include "lexer.h"
+
+/* Kinds of node that can appear in the AST. */
+typedef enum
+{
+    NODE_INT_LIT,    // integer literal
+    NODE_STRING_LIT, // string literal
+    NODE_VAR,        // variable reference
+    NODE_ASSIGN,     // assignment: x = expr
+    NODE_BINOP,      // binary operation: a + b
+    NODE_PRINT,      // print(expr)
+    NODE_IF,         // if cond: block
+    NODE_WHILE,      // while cond: block
+    NODE_BLOCK,      // sequence of statements
+} NodeType;
+
+/*
+ * A node of the syntax tree. `type` selects which member of `data` is
+ * meaningful. The union has no member for NODE_IF or NODE_WHILE.
+ */
+typedef struct ASTNode
+{
+    NodeType type;
+    union
+    {
+        int int_val;      // NODE_INT_LIT
+        char *string_val; // NODE_STRING_LIT; parse_term also stores the name of a NODE_VAR here
+        struct
+        {
+            char *name;            // variable being assigned
+            struct ASTNode *value; // expression whose result is assigned
+        } assign; // NODE_ASSIGN
+        struct
+        {
+            char op;               // operator character, e.g. '+' or '-'
+            struct ASTNode *left;  // left operand
+            struct ASTNode *right; // right operand
+        } binop; // NODE_BINOP
+        struct
+        {
+            struct ASTNode *expr; // expression to print
+        } print; // NODE_PRINT
+        struct
+        {
+            struct ASTNode **stmts; // array of statement nodes
+            int count;              // number of entries used in stmts
+        } block; // NODE_BLOCK
+    } data;
+} ASTNode;
+
+/*
+ * Allocates a new AST node of kind `type`.
+ *
+ * The node is zero-filled, so its payload bytes start at zero instead of
+ * holding whatever the allocator returned. Prints an error and exits if the
+ * allocation fails; the result is therefore never NULL. The caller owns the
+ * node.
+ */
+ASTNode *make_node(NodeType type)
+{
+    ASTNode *node = (ASTNode *)calloc(1, sizeof(ASTNode));
+    if (node == NULL)
+    {
+        printf("ERROR: sin memoria para un nodo del AST\n");
+        exit(1);
+    }
+    node->type = type;
+    return node;
+}
+
+// Index into the token array of the next token to consume; read and advanced by the parse_* functions.
+int pos = 0;
+
+/*
+ * Parses a single operand at the current position: an integer literal
+ * (NODE_INT_LIT) or a variable reference (NODE_VAR, name kept in
+ * data.string_val and shared with the token, not copied).
+ *
+ * Advances the global cursor past the token. Prints an error and exits if the
+ * token is neither INT nor ID, or if an integer literal does not fit in an
+ * int (strtol is used instead of atoi so that overflow is detected rather
+ * than being undefined behaviour).
+ */
+ASTNode *parse_term(Token *tokens)
+{
+    if (tokens[pos].type == TOK_INT)
+    {
+        const char *text = tokens[pos].value;
+        char *end = NULL;
+        errno = 0;
+        long parsed = strtol(text, &end, 10);
+        if (end == text || *end != '\0' || errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX)
+        {
+            printf("ERROR: literal entero invalido o fuera de rango: %s\n", text);
+            exit(1);
+        }
+
+        ASTNode *node = make_node(NODE_INT_LIT);
+        node->data.int_val = (int)parsed;
+        pos++;
+        return node;
+    }
+    else if (tokens[pos].type == TOK_ID)
+    {
+        ASTNode *node = make_node(NODE_VAR);
+        node->data.string_val = tokens[pos].value;
+        pos++;
+        return node;
+    }
+    printf("ERROR: esperaba INT o ID, encontré tipo %d\n", tokens[pos].type);
+    exit(1);
+}
+
+/*
+ * Parses `term (('+' | '-') term)*` starting at the current position and
+ * returns the resulting tree. Operators associate to the left, so
+ * `a + b - c` becomes `(a + b) - c`. Advances the global cursor past
+ * everything it consumes.
+ */
+ASTNode *parse_expr(Token *tokens)
+{
+    ASTNode *tree = parse_term(tokens);
+
+    for (;;)
+    {
+        TokenType kind = tokens[pos].type;
+        if (kind != TOK_PLUS && kind != TOK_MINUS)
+        {
+            break;
+        }
+
+        // The operator character is the first byte of the token text.
+        char symbol = tokens[pos].value[0];
+        pos++;
+        ASTNode *rhs = parse_term(tokens);
+
+        // The tree built so far becomes the left child of the new operator.
+        ASTNode *parent = make_node(NODE_BINOP);
+        parent->data.binop.op = symbol;
+        parent->data.binop.left = tree;
+        parent->data.binop.right = rhs;
+        tree = parent;
+    }
+
+    return tree;
+}
+
+/*
+ * Parses one statement at the current position:
+ *   ID '=' expr   -> NODE_ASSIGN (name shared with the token, not copied)
+ *   'print' expr  -> NODE_PRINT
+ *
+ * Advances the global cursor past the statement. Prints an error and exits
+ * if the statement starts with any other token, or if an identifier is not
+ * followed by '=' (previously the token after the identifier was skipped
+ * without being checked, so `x + 1` was accepted as an assignment).
+ */
+ASTNode *parse_statement(Token *tokens)
+{
+    if (tokens[pos].type == TOK_ID)
+    {
+        char *name = tokens[pos].value;
+        pos++; // consume the identifier
+        if (tokens[pos].type != TOK_ASSIGN)
+        {
+            printf("ERROR: esperaba '=' despues de '%s', encontré tipo %d\n", name, tokens[pos].type);
+            exit(1);
+        }
+        pos++; // consume "="
+        ASTNode *value = parse_expr(tokens);
+
+        ASTNode *node = make_node(NODE_ASSIGN);
+        node->data.assign.name = name;
+        node->data.assign.value = value;
+        return node;
+    }
+    if (tokens[pos].type == TOK_PRINT)
+    {
+        pos++; // consume "print"
+        ASTNode *expr = parse_expr(tokens);
+
+        ASTNode *node = make_node(NODE_PRINT);
+        node->data.print.expr = expr;
+        return node;
+    }
+
+    printf("ERROR: statement inesperado\n");
+    exit(1);
+}
+
+/*
+ * Parses the first `token_count` tokens of `tokens` into a NODE_BLOCK holding
+ * one child per statement. Newline tokens between statements are skipped.
+ *
+ * The global cursor is reset to 0 on entry so that every call starts at the
+ * first token, and the statement array grows on demand instead of being
+ * limited to its initial 256 entries. Prints an error and exits if memory
+ * runs out. The caller owns the returned tree.
+ */
+ASTNode *parse(Token *tokens, int token_count)
+{
+    int capacity = 256; // initial number of statement slots
+    ASTNode *block = make_node(NODE_BLOCK);
+    block->data.block.stmts = (ASTNode **)malloc(sizeof(ASTNode *) * (size_t)capacity);
+    if (block->data.block.stmts == NULL)
+    {
+        printf("ERROR: sin memoria para el bloque\n");
+        exit(1);
+    }
+    block->data.block.count = 0;
+    pos = 0;
+
+    while (pos < token_count)
+    {
+        if (tokens[pos].type == TOK_NEWLINE)
+        {
+            pos++; // skip stray newlines
+            continue;
+        }
+
+        if (block->data.block.count == capacity)
+        {
+            // Keep the old pointer until realloc is known to have succeeded.
+            int grown_capacity = capacity * 2;
+            ASTNode **grown = (ASTNode **)realloc(block->data.block.stmts,
+                                                  sizeof(ASTNode *) * (size_t)grown_capacity);
+            if (grown == NULL)
+            {
+                printf("ERROR: sin memoria para el bloque\n");
+                exit(1);
+            }
+            block->data.block.stmts = grown;
+            capacity = grown_capacity;
+        }
+
+        ASTNode *stmt = parse_statement(tokens);
+        block->data.block.stmts[block->data.block.count++] = stmt;
+
+        // Consume the newline that ends the statement
+        if (pos < token_count && tokens[pos].type == TOK_NEWLINE)
+        {
+            pos++;
+        }
+    }
+    return block;
+}
+
+/*
+ * Prints `node` and its descendants to stdout as an ASCII tree.
+ *
+ * `prefix` is the indentation accumulated from the ancestors; `is_last` is
+ * non-zero when the node is the final child of its parent, which selects the
+ * corner marker and stops the vertical bar below it. A NULL node prints
+ * nothing. The prefix handed to children is built in a 256-byte buffer, so
+ * snprintf truncates it on very deep trees.
+ */
+void ast_print(ASTNode *node, const char *prefix, int is_last)
+{
+    if (node == NULL)
+    {
+        return;
+    }
+
+    // Marker drawn in front of this node, and the padding its children inherit.
+    const char *branch = is_last ? "`-- " : "|-- ";
+    const char *padding = is_last ? "    " : "|   ";
+
+    printf("%s%s", prefix, branch);
+
+    char child_prefix[256];
+    snprintf(child_prefix, sizeof(child_prefix), "%s%s", prefix, padding);
+
+    NodeType kind = node->type;
+    if (kind == NODE_INT_LIT)
+    {
+        printf("NODE_INT_LIT(%d)\n", node->data.int_val);
+    }
+    else if (kind == NODE_STRING_LIT)
+    {
+        printf("NODE_STRING_LIT(\"%s\")\n", node->data.string_val);
+    }
+    else if (kind == NODE_VAR)
+    {
+        printf("NODE_VAR(\"%s\")\n", node->data.string_val);
+    }
+    else if (kind == NODE_ASSIGN)
+    {
+        printf("NODE_ASSIGN { name:\"%s\" }\n", node->data.assign.name);
+        ast_print(node->data.assign.value, child_prefix, 1);
+    }
+    else if (kind == NODE_BINOP)
+    {
+        printf("NODE_BINOP('%c')\n", node->data.binop.op);
+        ast_print(node->data.binop.left, child_prefix, 0);
+        ast_print(node->data.binop.right, child_prefix, 1);
+    }
+    else if (kind == NODE_PRINT)
+    {
+        printf("NODE_PRINT\n");
+        ast_print(node->data.print.expr, child_prefix, 1);
+    }
+    else if (kind == NODE_BLOCK)
+    {
+        printf("NODE_BLOCK\n");
+        int total = node->data.block.count;
+        int idx = 0;
+        while (idx < total)
+        {
+            // Only the final statement gets the closing corner marker.
+            ast_print(node->data.block.stmts[idx], child_prefix, idx == total - 1);
+            idx++;
+        }
+    }
+    else
+    {
+        printf("UNKNOWN\n");
+    }
+}
+
+/*
+ * Prints the tree rooted at `node` to stdout, starting with an empty prefix
+ * and treating the root as the last child.
+ */
+void ast_debug(ASTNode *node)
+{
+    const char *no_prefix = "";
+    const int root_is_last = 1;
+    ast_print(node, no_prefix, root_is_last);
+}
--- a/src/main.c
+++ b/src/main.c
@@ -1,3 +1,4 @@
+#include "frontend/parser.h"
 #include "objects/object.h"

 int main() {
@@ -22,11 +23,16 @@ int main() {
  items[1] = stringVar1;
  items[2] = stringVar1;

-
-
  obj_print(allocPtr, listVar1, "");

  JLANG_visualize(allocPtr);

+  // Lexer test
+  int totalTokens = 0;
+  Token *tokens = tokenize("x = 10\ny = 5\nz = x + y\nprint z", &totalTokens);
+  printf("totalTokens=%d\n", totalTokens);
+  ASTNode* block = parse(tokens, totalTokens);
+  ast_debug(block);
+
  return 0;
 }