从‘Hello World’到编译器:用Python手写一个简单的语法树生成器(附完整代码)

从‘Hello World’到编译器:用Python手写一个简单的语法树生成器(附完整代码) 从‘Hello World’到编译器用Python手写一个简单的语法树生成器附完整代码编译原理常被视为计算机科学中最艰深的领域之一那些晦涩的术语和抽象概念让许多开发者望而却步。但当我第一次看到算术表达式被拆解成树状结构时突然意识到语法树不过是代码的另一种可视化形式。本文将用不到200行Python代码带您实现一个能解析四则运算并生成语法树的微型编译器前端这种从做中学的方式远比死记硬背文法规则更有效。1. 环境准备与基础概念在开始编码前我们需要明确几个关键概念。上下文无关文法Context-Free Grammar是描述编程语言语法的标准工具它由一组产生式规则组成。以简单算术表达式为例其文法可以表示为Expr → Term Expr | Term - Expr | Term Term → Factor * Term | Factor / Term | Factor Factor → ( Expr ) | Number Number → [0-9]这个文法清晰地定义了运算优先级括号乘除加减。递归下降解析器正是基于这种分层结构设计的——每个非终结符如Expr、Term对应一个解析函数形成自然的调用层次。准备Python 3.8环境我们只需要标准库import sys import re from dataclasses import dataclass from typing import Union, List2. 词法分析器实现词法分析器Lexer负责将原始字符串转换为标记流Token Stream。我们首先定义标记类型dataclass class Token: type: str # 类型如NUMBER, PLUS, LPAREN等 value: Union[str, int, float] pos: int # 在源字符串中的位置 class Lexer: TOKEN_SPEC [ (NUMBER, r\d(\.\d*)?), (PLUS, r\), (MINUS, r-), (MUL, r\*), (DIV, r/), (LPAREN, r\(), (RPAREN, r\)), (SKIP, r[ \t]), # 忽略空格和制表符 ] def __init__(self, text): self.text text self.pos 0 self.tokens self._tokenize() def _tokenize(self): tokens [] while self.pos len(self.text): for token_type, pattern in self.TOKEN_SPEC: regex re.compile(pattern) match regex.match(self.text, self.pos) if match: value match.group() if token_type ! SKIP: if token_type NUMBER: value float(value) if . in value else int(value) tokens.append(Token(token_type, value, self.pos)) self.pos match.end() break else: raise SyntaxError(fUnexpected character: {self.text[self.pos]}) return tokens测试词法分析器lexer Lexer(3 4 * (10 - 5)) print([(t.type, t.value) for t in lexer.tokens]) # 输出: [(NUMBER, 3), (PLUS, ), (NUMBER, 4), (MUL, *), # (LPAREN, (), (NUMBER, 10), (MINUS, -), (NUMBER, 5), (RPAREN, ))]3. 递归下降语法分析语法分析器Parser将标记流转换为抽象语法树AST。我们首先定义AST节点类型class ASTNode: pass dataclass class BinOp(ASTNode): left: ASTNode op: Token right: ASTNode dataclass class Num(ASTNode): token: Token dataclass class UnaryOp(ASTNode): op: Token expr: ASTNode递归下降解析器的核心是分层解析函数每个函数对应文法中的一个非终结符class Parser: def __init__(self, tokens): self.tokens tokens self.current_token None self.pos -1 self.advance() def advance(self): self.pos 1 self.current_token self.tokens[self.pos] if self.pos len(self.tokens) else None def parse(self): return self.expr() def expr(self): node self.term() while self.current_token and self.current_token.type in (PLUS, MINUS): op self.current_token self.advance() node BinOp(leftnode, opop, rightself.term()) return node def term(self): node self.factor() while self.current_token and self.current_token.type in (MUL, DIV): op self.current_token self.advance() node BinOp(leftnode, opop, rightself.factor()) return node def factor(self): token self.current_token if token.type LPAREN: self.advance() node self.expr() if self.current_token.type ! RPAREN: raise SyntaxError(Expected closing parenthesis) self.advance() return node elif token.type NUMBER: self.advance() return Num(token) elif token.type in (PLUS, MINUS): self.advance() return UnaryOp(optoken, exprself.factor()) else: raise SyntaxError(fUnexpected token: {token.type})测试解析器tokens Lexer(3 4 * (10 - 5)).tokens ast Parser(tokens).parse() print(ast) # 输出类似: BinOp(leftNum(tokenToken(typeNUMBER, value3, pos0)), # opToken(typePLUS, value, pos2), # rightBinOp(leftNum(tokenToken(typeNUMBER, value4, pos4)), # opToken(typeMUL, value*, pos6), # right...))4. 可视化语法树为了直观理解解析结果我们实现一个简单的树形打印器def print_ast(node, indent0): if isinstance(node, Num): print( * indent fNumber({node.token.value})) elif isinstance(node, UnaryOp): print( * indent fUnaryOp({node.op.value})) print_ast(node.expr, indent 2) elif isinstance(node, BinOp): print( * indent fBinOp({node.op.value})) print_ast(node.left, indent 2) print_ast(node.right, indent 2) print_ast(ast) 输出示例: BinOp() Number(3) BinOp(*) Number(4) BinOp(-) Number(10) Number(5) 更专业的可视化可以使用Graphviz生成图片。添加以下代码from graphviz import Digraph def visualize_ast(node, graphNone): if graph is None: graph Digraph() graph.node(str(id(node)), labelfRoot: {node.__class__.__name__}) if isinstance(node, Num): graph.node(str(id(node)), labelfNumber\n{node.token.value}) elif isinstance(node, UnaryOp): graph.node(str(id(node)), labelfUnary {node.op.value}) graph.edge(str(id(node)), str(id(node.expr))) visualize_ast(node.expr, graph) elif isinstance(node, BinOp): graph.node(str(id(node)), labelfBinary {node.op.value}) graph.edge(str(id(node)), str(id(node.left))) graph.edge(str(id(node)), str(id(node.right))) visualize_ast(node.left, graph) visualize_ast(node.right, graph) return graph visualize_ast(ast).render(ast, viewTrue) # 生成PDF可视化文件5. 错误处理与扩展建议当前实现已经能处理基本算术表达式但还需要增强健壮性错误恢复当遇到语法错误时可以尝试跳过当前标记继续解析更丰富的语法支持变量、函数调用等特性语义分析检查类型一致性等语义规则添加错误恢复的改进版解析器示例class AdvancedParser(Parser): def _error(self, message): print(fSyntaxError at position {self.current_token.pos}: {message}) # 尝试跳过当前标记继续解析 self.advance() return None def factor(self): try: token self.current_token if not token: return None if token.type LPAREN: self.advance() node self.expr() if not self.current_token or self.current_token.type ! RPAREN: return self._error(Expected )) self.advance() return node elif token.type NUMBER: self.advance() return Num(token) elif token.type in (PLUS, MINUS): self.advance() expr self.factor() return UnaryOp(optoken, exprexpr) if expr else None else: return self._error(fUnexpected token: {token.type}) except IndexError: return self._error(Unexpected end of input)实际项目中您可能还需要考虑以下优化词法分析性能使用预编译的正则表达式语法树遍历实现Visitor模式便于后续分析运算符优先级表动态调整优先级而不用修改文法# Visitor模式示例 class ASTVisitor: def visit(self, node): method_name fvisit_{type(node).__name__} visitor getattr(self, method_name, self.generic_visit) return visitor(node) def generic_visit(self, node): raise Exception(fNo visit_{type(node).__name__} method) class EvalVisitor(ASTVisitor): def visit_Num(self, node): return node.token.value def visit_BinOp(self, node): left self.visit(node.left) right self.visit(node.right) if node.op.type PLUS: return left right elif node.op.type MINUS: return left - right elif node.op.type MUL: return left * right elif node.op.type DIV: return left / right def visit_UnaryOp(self, node): value self.visit(node.expr) return -value if node.op.type MINUS else value eval_visitor EvalVisitor() result eval_visitor.visit(ast) print(f计算结果: {result}) # 输出: 计算结果: 23.0这个微型编译器前端虽然简单但已经包含了现代编译器的核心思想。当您下次使用Babel或TypeScript编译器时会发现它们本质上也是在构建和转换语法树——只是处理的语法更复杂优化的程度更深而已。