gh-102856: Clean some of the PEP 701 tokenizer implementation by pablogsal · Pull Request #103634 · python/cpython
#include "unicodeobject.h" #include "bytesobject.h" #include "fileobject.h" #include "abstract.h"
/* Alternate tab spacing */ #define ALTTABSIZE 1
#define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0) #define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0) #ifdef Py_DEBUG static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0);
/* Forward */
switch (cur) { case '{': if (tok_mode->last_expr_buffer != NULL) { PyMem_Free(tok_mode->last_expr_buffer); } tok_mode->last_expr_buffer = PyMem_Malloc(size); if (tok_mode->last_expr_buffer == NULL) { tok->done = E_NOMEM; return 0; } tok_mode->last_expr_size = size; tok_mode->last_expr_end = -1; strncpy(tok_mode->last_expr_buffer, tok->cur, size); break; case 0: case 0: if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { return 1; }
return 1; error: tok->done = E_NOMEM; return 0; }
static void
if (tok->tok_mode_stack_index > 0) { if (INSIDE_FSTRING(tok)) { return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'")); }
p_start = tok->start; p_end = tok->cur; tokenizer_mode *current_tok = TOK_NEXT_MODE(tok); current_tok->kind = TOK_FSTRING_MODE; current_tok->f_string_quote = quote; current_tok->f_string_quote_size = quote_size; current_tok->f_string_start = tok->start; current_tok->f_string_multi_line_start = tok->line_start; current_tok->last_expr_buffer = NULL; current_tok->last_expr_size = 0; current_tok->last_expr_end = -1; tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); the_current_tok->kind = TOK_FSTRING_MODE; the_current_tok->f_string_quote = quote; the_current_tok->f_string_quote_size = quote_size; the_current_tok->f_string_start = tok->start; the_current_tok->f_string_multi_line_start = tok->line_start; the_current_tok->last_expr_buffer = NULL; the_current_tok->last_expr_size = 0; the_current_tok->last_expr_end = -1;
switch (*tok->start) { case 'F': case 'f': current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; break; case 'R': case 'r': current_tok->f_string_raw = 1; the_current_tok->f_string_raw = 1; break; default: Py_UNREACHABLE(); }
current_tok->bracket_stack = 0; current_tok->bracket_mark[0] = 0; current_tok->bracket_mark_index = -1; the_current_tok->curly_bracket_depth = 0; the_current_tok->curly_bracket_expr_start_depth = -1; return MAKE_TOKEN(FSTRING_START); }
if (tok->tok_mode_stack_index > 0) { if (INSIDE_FSTRING(tok)) { /* When we are in an f-string, before raising the * unterminated string literal error, check whether * does the initial quote matches with f-strings quotes * and if it is, then this must be a missing '}' token * so raise the proper error */ tokenizer_mode *current_tok = TOK_GET_MODE(tok); if (current_tok->f_string_quote == quote && current_tok->f_string_quote_size == quote_size) { tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); if (the_current_tok->f_string_quote == quote && the_current_tok->f_string_quote_size == quote_size) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start)); } }
/* Punctuation character */ int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); if (is_punctuation && tok->tok_mode_stack_index > 0 && current_tok->bracket_mark_index >= 0) { int mark = *TOK_GET_BRACKET_MARK(current_tok); /* This code block gets executed before the bracket_stack is incremented if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { /* This code block gets executed before the curly_bracket_depth is incremented * by the `{` case, so for ensuring that we are on the 0th level, we need * to adjust it manually */ int cursor = current_tok->bracket_stack - (c != '{'); int cursor = current_tok->curly_bracket_depth - (c != '{');
if (cursor == 0 && !update_fstring_expr(tok, c)) { return MAKE_TOKEN(ENDMARKER); }
if (c == ':' && cursor == mark) { if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { current_tok->kind = TOK_FSTRING_MODE; p_start = tok->start; p_end = tok->cur;
if (tok->tok_mode_stack_index > 0) { current_tok->bracket_stack++; if (INSIDE_FSTRING(tok)) { current_tok->curly_bracket_depth++; } break; case ')': case ']': case '}': if (!tok->level) { if (tok->tok_mode_stack_index > 0 && !current_tok->bracket_stack && c == '}') { if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed")); } return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
if (tok->tok_mode_stack_index > 0) { current_tok->bracket_stack--; if (c == '}' && current_tok->bracket_stack == *TOK_GET_BRACKET_MARK(current_tok)) { current_tok->bracket_mark_index--; if (INSIDE_FSTRING(tok)) { current_tok->curly_bracket_depth--; if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { current_tok->curly_bracket_expr_start_depth--; current_tok->kind = TOK_FSTRING_MODE; } } break; default: break; }
if (!Py_UNICODE_ISPRINTABLE(c)) {
if ((start_char == '{' && peek1 != '{') || (start_char == '}' && peek1 != '}')) { if (start_char == '{') { current_tok->bracket_mark_index++; if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) { current_tok->curly_bracket_expr_start_depth++; if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } *TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack; } TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; return tok_get_normal_mode(tok, current_tok, token);
int in_format_spec = current_tok->last_expr_end != -1 && current_tok->bracket_mark_index >= 0; int in_format_spec = ( current_tok->last_expr_end != -1 && INSIDE_FSTRING_EXPR(current_tok) ); if (c == '{') { int peek = tok_nextc(tok); if (peek != '{' || in_format_spec) { tok_backup(tok, peek); tok_backup(tok, c); current_tok->bracket_mark_index++; if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) { current_tok->curly_bracket_expr_start_depth++; if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } *TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack; TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; p_start = tok->start; p_end = tok->cur;