Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions Include/internal/pycore_global_objects_fini_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions Include/internal/pycore_global_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,30 @@ struct _Py_global_strings {
STRUCT_FOR_STR(list_err, "list index out of range")
STRUCT_FOR_STR(native, "<native>")
STRUCT_FOR_STR(str_replace_inf, "1e309")
STRUCT_FOR_STR(token_amperequal, "&=")
STRUCT_FOR_STR(token_atequal, "@=")
STRUCT_FOR_STR(token_circumflexequal, "^=")
STRUCT_FOR_STR(token_colonequal, ":=")
STRUCT_FOR_STR(token_double_slash, "//")
STRUCT_FOR_STR(token_double_slashequal, "//=")
STRUCT_FOR_STR(token_doublestar, "**")
STRUCT_FOR_STR(token_doublestarequal, "**=")
STRUCT_FOR_STR(token_ellipsis, "...")
STRUCT_FOR_STR(token_eqequal, "==")
STRUCT_FOR_STR(token_greaterequal, ">=")
STRUCT_FOR_STR(token_leftshift, "<<")
STRUCT_FOR_STR(token_leftshiftequal, "<<=")
STRUCT_FOR_STR(token_lessequal, "<=")
STRUCT_FOR_STR(token_minequal, "-=")
STRUCT_FOR_STR(token_notequal, "!=")
STRUCT_FOR_STR(token_percentequal, "%=")
STRUCT_FOR_STR(token_plusequal, "+=")
STRUCT_FOR_STR(token_rarrow, "->")
STRUCT_FOR_STR(token_rightshift, ">>")
STRUCT_FOR_STR(token_rightshiftequal, ">>=")
STRUCT_FOR_STR(token_slashequal, "/=")
STRUCT_FOR_STR(token_starequal, "*=")
STRUCT_FOR_STR(token_vbarequal, "|=")
STRUCT_FOR_STR(type_params, ".type_params")
STRUCT_FOR_STR(utf_8, "utf-8")
} literals;
Expand Down
24 changes: 24 additions & 0 deletions Include/internal/pycore_runtime_init_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

96 changes: 96 additions & 0 deletions Include/internal/pycore_unicodeobject_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1885,6 +1885,45 @@ def test_exact_type(self):
token.NAME, token.AMPER, token.NUMBER,
token.RPAR)

def test_exact_operator_token_strings_are_reused(self):
    """Check that exact multi-character operator tokens are interned.

    Every operator is tokenized twice on a single line.  Both resulting
    token strings must be reported as interned by ``sys._is_interned()``,
    which states the expected property directly instead of inferring it
    from the identity of two separately produced tokens.
    """
    # Imported locally: nothing else in this test needs ``sys``.
    import sys

    operators = (
        '...', '->', '!=', '%=', '&=', '**', '**=', '*=',
        '+=', '-=', '//', '//=', '/=', ':=', '<<', '<<=',
        '<=', '==', '>=', '>>', '>>=', '@=', '^=', '|=',
    )
    for op in operators:
        with self.subTest(op=op):
            source = f'{op} {op}\n'.encode()
            tokens = tokenize.tokenize(BytesIO(source).readline)
            matches = [tok.string for tok in tokens if tok.string == op]
            # The operator appears exactly twice in the source line.
            self.assertEqual(len(matches), 2)
            for string in matches:
                self.assertTrue(sys._is_interned(string))
Comment on lines +1898 to +1900

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use sys._is_interned instead of relying on is.


def test_exact_operator_token_strings_are_reused_by_c_tokenizer(self):
operators = (
'...', '->', '!=', '%=', '&=', '**', '**=', '*=',
'+=', '-=', '//', '//=', '/=', ':=', '<<', '<<=',
'<=', '==', '>=', '>>', '>>=', '@=', '^=', '|=',
)
for op in operators:
with self.subTest(op=op):
source = BytesIO(f'{op} {op}\n'.encode())
tokens = list(tokenize._tokenize.TokenizerIter(
source.readline, encoding='utf-8', extra_tokens=True))
Comment on lines +1911 to +1912

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not a fan of getting _tokenize this way, nor do I think this test stresses anything different from the first one. Let's keep just the first test and remove this one.

matches = [tok[1] for tok in tokens if tok[1] == op]
self.assertEqual(len(matches), 2)
self.assertIs(matches[0], matches[1])

def test_old_not_equal_spelling_is_not_rewritten(self):
    """The ``<>`` spelling must come back verbatim, never as ``!=``."""
    stream = BytesIO(
        b'from __future__ import barry_as_FLUFL\n'
        b'a <> b\n'
    )
    token_iter = tokenize._tokenize.TokenizerIter(
        stream.readline, encoding='utf-8', extra_tokens=True)
    # Index 1 of each yielded tuple is the token's string.
    strings = [tok[1] for tok in token_iter]
    self.assertIn('<>', strings)
    self.assertNotIn('!=', strings)
Comment on lines +1917 to +1925

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test passes on main and isn't related to this change; please remove it.


def test_pathological_trailing_whitespace(self):
# See http://bugs.python.org/issue16152
self.assertExactTypeEqual('@ ', token.AT)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Reduce duplicate string allocations in :mod:`tokenize` by reusing static
strings for exact multi-character operator tokens.
75 changes: 74 additions & 1 deletion Python/Python-tokenize.c
Original file line number Diff line number Diff line change
@@ -1,12 +1,38 @@
#include "Python.h"
#include "errcode.h"
#include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
#include "internal/pycore_global_strings.h" // _Py_DECLARE_STR()
#include "internal/pycore_tuple.h" // _PyTuple_FromPair
#include "../Parser/lexer/state.h"
#include "../Parser/lexer/lexer.h"
#include "../Parser/tokenizer/tokenizer.h"
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()

/* Static strings for the exact multi-character operator tokens handed out
   by get_static_exact_token_str().  Kept in alphabetical order by name. */
_Py_DECLARE_STR(token_amperequal, "&=")
_Py_DECLARE_STR(token_atequal, "@=")
_Py_DECLARE_STR(token_circumflexequal, "^=")
_Py_DECLARE_STR(token_colonequal, ":=")
_Py_DECLARE_STR(token_double_slash, "//")
_Py_DECLARE_STR(token_double_slashequal, "//=")
_Py_DECLARE_STR(token_doublestar, "**")
_Py_DECLARE_STR(token_doublestarequal, "**=")
_Py_DECLARE_STR(token_ellipsis, "...")
_Py_DECLARE_STR(token_eqequal, "==")
_Py_DECLARE_STR(token_greaterequal, ">=")
_Py_DECLARE_STR(token_leftshift, "<<")
_Py_DECLARE_STR(token_leftshiftequal, "<<=")
_Py_DECLARE_STR(token_lessequal, "<=")
_Py_DECLARE_STR(token_minequal, "-=")
_Py_DECLARE_STR(token_notequal, "!=")
_Py_DECLARE_STR(token_percentequal, "%=")
_Py_DECLARE_STR(token_plusequal, "+=")
_Py_DECLARE_STR(token_rarrow, "->")
_Py_DECLARE_STR(token_rightshift, ">>")
_Py_DECLARE_STR(token_rightshiftequal, ">>=")
_Py_DECLARE_STR(token_slashequal, "/=")
_Py_DECLARE_STR(token_starequal, "*=")
_Py_DECLARE_STR(token_vbarequal, "|=")

static struct PyModuleDef _tokenizemodule;

typedef struct {
Expand Down Expand Up @@ -42,6 +68,49 @@ typedef struct
Py_ssize_t byte_col_offset_diff;
} tokenizeriterobject;

/* Look up the statically allocated string for an exact operator token.

   type  -- token type of the token being converted
   start -- first byte of the token text (not NUL-terminated)
   len   -- number of bytes in the token text

   Returns a new strong reference to the matching static string when
   `type` is one of the multi-character operators listed below and the
   token text equals that operator's literal spelling.  Otherwise returns
   NULL without setting an exception; the caller is expected to build the
   string itself in that case. */
static PyObject *
get_static_exact_token_str(int type, const char *start, Py_ssize_t len)
{
    PyObject *found = NULL;

/* The text is compared as well as the type, so a token whose spelling
   differs from the canonical literal is never replaced (presumably this
   covers alternative spellings such as "<>" for NOTEQUAL). */
#define STATIC_TOKEN_CASE(TYPE, NAME, LITERAL)                      \
    case TYPE:                                                      \
        if (len == (Py_ssize_t)(sizeof(LITERAL) - 1)                \
            && memcmp(start, LITERAL, sizeof(LITERAL) - 1) == 0)    \
        {                                                           \
            found = &_Py_STR(NAME);                                 \
        }                                                           \
        break

    switch (type) {
        STATIC_TOKEN_CASE(ATEQUAL, token_atequal, "@=");
        STATIC_TOKEN_CASE(CIRCUMFLEXEQUAL, token_circumflexequal, "^=");
        STATIC_TOKEN_CASE(COLONEQUAL, token_colonequal, ":=");
        STATIC_TOKEN_CASE(DOUBLESLASH, token_double_slash, "//");
        STATIC_TOKEN_CASE(DOUBLESLASHEQUAL, token_double_slashequal, "//=");
        STATIC_TOKEN_CASE(DOUBLESTAR, token_doublestar, "**");
        STATIC_TOKEN_CASE(DOUBLESTAREQUAL, token_doublestarequal, "**=");
        STATIC_TOKEN_CASE(ELLIPSIS, token_ellipsis, "...");
        STATIC_TOKEN_CASE(EQEQUAL, token_eqequal, "==");
        STATIC_TOKEN_CASE(GREATEREQUAL, token_greaterequal, ">=");
        STATIC_TOKEN_CASE(LEFTSHIFT, token_leftshift, "<<");
        STATIC_TOKEN_CASE(LEFTSHIFTEQUAL, token_leftshiftequal, "<<=");
        STATIC_TOKEN_CASE(LESSEQUAL, token_lessequal, "<=");
        STATIC_TOKEN_CASE(MINEQUAL, token_minequal, "-=");
        STATIC_TOKEN_CASE(NOTEQUAL, token_notequal, "!=");
        STATIC_TOKEN_CASE(PERCENTEQUAL, token_percentequal, "%=");
        STATIC_TOKEN_CASE(PLUSEQUAL, token_plusequal, "+=");
        STATIC_TOKEN_CASE(RARROW, token_rarrow, "->");
        STATIC_TOKEN_CASE(RIGHTSHIFT, token_rightshift, ">>");
        STATIC_TOKEN_CASE(RIGHTSHIFTEQUAL, token_rightshiftequal, ">>=");
        STATIC_TOKEN_CASE(SLASHEQUAL, token_slashequal, "/=");
        STATIC_TOKEN_CASE(STAREQUAL, token_starequal, "*=");
        STATIC_TOKEN_CASE(VBAREQUAL, token_vbarequal, "|=");
        STATIC_TOKEN_CASE(AMPEREQUAL, token_amperequal, "&=");
        default:
            /* Not an operator with a static string. */
            break;
    }

#undef STATIC_TOKEN_CASE

    if (found == NULL) {
        return NULL;
    }
    return Py_NewRef(found);
}

/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
Expand Down Expand Up @@ -268,7 +337,11 @@ tokenizeriter_next(PyObject *op)
str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
}
else {
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
Py_ssize_t len = token.end - token.start;
str = get_static_exact_token_str(type, token.start, len);
if (str == NULL) {
str = PyUnicode_FromStringAndSize(token.start, len);
}
}
if (str == NULL) {
goto exit;
Expand Down
Loading