Skip to content

Commit 8be2a0c

Browse files
committed
Reuse static strings for exact operator tokens
1 parent 2f064fb commit 8be2a0c

6 files changed

Lines changed: 281 additions & 1 deletion

File tree

Include/internal/pycore_global_objects_fini_generated.h

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_global_strings.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,30 @@ struct _Py_global_strings {
5353
STRUCT_FOR_STR(list_err, "list index out of range")
5454
STRUCT_FOR_STR(native, "<native>")
5555
STRUCT_FOR_STR(str_replace_inf, "1e309")
56+
STRUCT_FOR_STR(token_amperequal, "&=")
57+
STRUCT_FOR_STR(token_atequal, "@=")
58+
STRUCT_FOR_STR(token_circumflexequal, "^=")
59+
STRUCT_FOR_STR(token_colonequal, ":=")
60+
STRUCT_FOR_STR(token_double_slash, "//")
61+
STRUCT_FOR_STR(token_double_slashequal, "//=")
62+
STRUCT_FOR_STR(token_doublestar, "**")
63+
STRUCT_FOR_STR(token_doublestarequal, "**=")
64+
STRUCT_FOR_STR(token_ellipsis, "...")
65+
STRUCT_FOR_STR(token_eqequal, "==")
66+
STRUCT_FOR_STR(token_greaterequal, ">=")
67+
STRUCT_FOR_STR(token_leftshift, "<<")
68+
STRUCT_FOR_STR(token_leftshiftequal, "<<=")
69+
STRUCT_FOR_STR(token_lessequal, "<=")
70+
STRUCT_FOR_STR(token_minequal, "-=")
71+
STRUCT_FOR_STR(token_notequal, "!=")
72+
STRUCT_FOR_STR(token_percentequal, "%=")
73+
STRUCT_FOR_STR(token_plusequal, "+=")
74+
STRUCT_FOR_STR(token_rarrow, "->")
75+
STRUCT_FOR_STR(token_rightshift, ">>")
76+
STRUCT_FOR_STR(token_rightshiftequal, ">>=")
77+
STRUCT_FOR_STR(token_slashequal, "/=")
78+
STRUCT_FOR_STR(token_starequal, "*=")
79+
STRUCT_FOR_STR(token_vbarequal, "|=")
5680
STRUCT_FOR_STR(type_params, ".type_params")
5781
STRUCT_FOR_STR(utf_8, "utf-8")
5882
} literals;

Include/internal/pycore_runtime_init_generated.h

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_unicodeobject_generated.h

Lines changed: 96 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/test/test_tokenize.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1885,6 +1885,45 @@ def test_exact_type(self):
18851885
token.NAME, token.AMPER, token.NUMBER,
18861886
token.RPAR)
18871887

1888+
# Tokenizes "<op> <op>\n" through the public tokenize.tokenize() API for each
# of the 24 multi-character operator spellings below, and checks that the two
# operator tokens carry the very same str object (identity, not just equality).
def test_exact_operator_token_strings_are_reused(self):
1889+
operators = (
1890+
'...', '->', '!=', '%=', '&=', '**', '**=', '*=',
1891+
'+=', '-=', '//', '//=', '/=', ':=', '<<', '<<=',
1892+
'<=', '==', '>=', '>>', '>>=', '@=', '^=', '|=',
1893+
)
1894+
for op in operators:
1895+
with self.subTest(op=op):
1896+
source = f'{op} {op}\n'.encode()
1897+
tokens = list(tokenize.tokenize(BytesIO(source).readline))
1898+
# Keep only the tokens whose text is the operator itself; the other
# tokens in the stream are ignored.
matches = [tok.string for tok in tokens if tok.string == op]
1899+
self.assertEqual(len(matches), 2)
1900+
# assertIs: both occurrences must be one shared object.
self.assertIs(matches[0], matches[1])
1901+
1902+
# Same identity check as the tokenize.tokenize() variant, but drives
# tokenize._tokenize.TokenizerIter directly (encoding='utf-8',
# extra_tokens=True) so no Python-level wrapper sits between the test and the
# iterator. Each yielded item is indexed positionally; item[1] is the token text.
def test_exact_operator_token_strings_are_reused_by_c_tokenizer(self):
1903+
operators = (
1904+
'...', '->', '!=', '%=', '&=', '**', '**=', '*=',
1905+
'+=', '-=', '//', '//=', '/=', ':=', '<<', '<<=',
1906+
'<=', '==', '>=', '>>', '>>=', '@=', '^=', '|=',
1907+
)
1908+
for op in operators:
1909+
with self.subTest(op=op):
1910+
source = BytesIO(f'{op} {op}\n'.encode())
1911+
tokens = list(tokenize._tokenize.TokenizerIter(
1912+
source.readline, encoding='utf-8', extra_tokens=True))
1913+
# Keep only the token texts equal to the operator under test.
matches = [tok[1] for tok in tokens if tok[1] == op]
1914+
self.assertEqual(len(matches), 2)
1915+
# assertIs: both occurrences must be one shared object.
self.assertIs(matches[0], matches[1])
1916+
1917+
# Guards against the operator-string reuse replacing a token's text with a
# canonical spelling: source written with "<>" must still produce a token
# whose text is "<>", and no "!=" token text may appear.
def test_old_not_equal_spelling_is_not_rewritten(self):
1918+
source = BytesIO(
1919+
b'from __future__ import barry_as_FLUFL\n'
1920+
b'a <> b\n'
1921+
)
1922+
tokens = list(tokenize._tokenize.TokenizerIter(
1923+
source.readline, encoding='utf-8', extra_tokens=True))
1924+
# tok[1] is the token text of each yielded item.
self.assertIn('<>', [tok[1] for tok in tokens])
1925+
self.assertNotIn('!=', [tok[1] for tok in tokens])
1926+
18881927
def test_pathological_trailing_whitespace(self):
18891928
# See http://bugs.python.org/issue16152
18901929
self.assertExactTypeEqual('@ ', token.AT)

Python/Python-tokenize.c

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,38 @@
11
#include "Python.h"
22
#include "errcode.h"
33
#include "internal/pycore_critical_section.h" // Py_BEGIN_CRITICAL_SECTION
4+
#include "internal/pycore_global_strings.h" // _Py_DECLARE_STR()
45
#include "internal/pycore_tuple.h" // _PyTuple_FromPair
56
#include "../Parser/lexer/state.h"
67
#include "../Parser/lexer/lexer.h"
78
#include "../Parser/tokenizer/tokenizer.h"
89
#include "../Parser/pegen.h" // _PyPegen_byte_offset_to_character_offset()
910

11+
/* Global string literals for the multi-character operator spellings.
 * Each name here is the one get_static_exact_token_str() passes to _Py_STR(),
 * and each has a matching STRUCT_FOR_STR(token_*, ...) entry in
 * pycore_global_strings.h.
 * NOTE(review): _Py_DECLARE_STR is defined outside this view; presumably it
 * only registers the name/literal pair for the generated headers -- confirm. */
_Py_DECLARE_STR(token_atequal, "@=")
12+
_Py_DECLARE_STR(token_circumflexequal, "^=")
13+
_Py_DECLARE_STR(token_colonequal, ":=")
14+
_Py_DECLARE_STR(token_double_slash, "//")
15+
_Py_DECLARE_STR(token_double_slashequal, "//=")
16+
_Py_DECLARE_STR(token_doublestar, "**")
17+
_Py_DECLARE_STR(token_doublestarequal, "**=")
18+
_Py_DECLARE_STR(token_ellipsis, "...")
19+
_Py_DECLARE_STR(token_eqequal, "==")
20+
_Py_DECLARE_STR(token_greaterequal, ">=")
21+
_Py_DECLARE_STR(token_leftshift, "<<")
22+
_Py_DECLARE_STR(token_leftshiftequal, "<<=")
23+
_Py_DECLARE_STR(token_lessequal, "<=")
24+
_Py_DECLARE_STR(token_minequal, "-=")
25+
_Py_DECLARE_STR(token_notequal, "!=")
26+
_Py_DECLARE_STR(token_percentequal, "%=")
27+
_Py_DECLARE_STR(token_plusequal, "+=")
28+
_Py_DECLARE_STR(token_rarrow, "->")
29+
_Py_DECLARE_STR(token_rightshift, ">>")
30+
_Py_DECLARE_STR(token_rightshiftequal, ">>=")
31+
_Py_DECLARE_STR(token_slashequal, "/=")
32+
_Py_DECLARE_STR(token_starequal, "*=")
33+
_Py_DECLARE_STR(token_vbarequal, "|=")
34+
_Py_DECLARE_STR(token_amperequal, "&=")
35+
1036
static struct PyModuleDef _tokenizemodule;
1137

1238
typedef struct {
@@ -42,6 +68,49 @@ typedef struct
4268
Py_ssize_t byte_col_offset_diff;
4369
} tokenizeriterobject;
4470

71+
/* Look up the global static str object for an exact operator token.
 *
 *   type   token type; only the operator types listed in the switch below
 *          are handled, every other value yields NULL.
 *   start  first byte of the token text.
 *   len    length of the token text in bytes.
 *
 * Returns a new reference (via Py_NewRef) to the matching _Py_STR() object
 * when both the type and the exact text match, otherwise NULL.  NULL is not
 * an error here: this function sets no exception, and the caller in
 * tokenizeriter_next() falls back to PyUnicode_FromStringAndSize().
 *
 * The text is compared as well as the type, so a token whose type matches
 * but whose spelling differs from the literal is not replaced (presumably
 * this is what preserves the legacy "<>" spelling exercised by
 * test_old_not_equal_spelling_is_not_rewritten -- confirm). */
static PyObject *
72+
get_static_exact_token_str(int type, const char *start, Py_ssize_t len)
73+
{
74+
/* One switch case per operator: hand out the static string only when the
 * token text is byte-for-byte LITERAL (sizeof(LITERAL) - 1 drops the NUL).
 * The expansion ends in a bare `break`, so each use supplies the `;`. */
#define RETURN_STATIC_TOKEN_STR(TYPE, NAME, LITERAL) \
75+
case TYPE: \
76+
if (len == (Py_ssize_t)(sizeof(LITERAL) - 1) \
77+
&& memcmp(start, LITERAL, sizeof(LITERAL) - 1) == 0) \
78+
{ \
79+
return Py_NewRef(&_Py_STR(NAME)); \
80+
} \
81+
break
82+
83+
switch (type) {
84+
RETURN_STATIC_TOKEN_STR(ATEQUAL, token_atequal, "@=");
85+
RETURN_STATIC_TOKEN_STR(CIRCUMFLEXEQUAL, token_circumflexequal, "^=");
86+
RETURN_STATIC_TOKEN_STR(COLONEQUAL, token_colonequal, ":=");
87+
RETURN_STATIC_TOKEN_STR(DOUBLESLASH, token_double_slash, "//");
88+
RETURN_STATIC_TOKEN_STR(DOUBLESLASHEQUAL, token_double_slashequal, "//=");
89+
RETURN_STATIC_TOKEN_STR(DOUBLESTAR, token_doublestar, "**");
90+
RETURN_STATIC_TOKEN_STR(DOUBLESTAREQUAL, token_doublestarequal, "**=");
91+
RETURN_STATIC_TOKEN_STR(ELLIPSIS, token_ellipsis, "...");
92+
RETURN_STATIC_TOKEN_STR(EQEQUAL, token_eqequal, "==");
93+
RETURN_STATIC_TOKEN_STR(GREATEREQUAL, token_greaterequal, ">=");
94+
RETURN_STATIC_TOKEN_STR(LEFTSHIFT, token_leftshift, "<<");
95+
RETURN_STATIC_TOKEN_STR(LEFTSHIFTEQUAL, token_leftshiftequal, "<<=");
96+
RETURN_STATIC_TOKEN_STR(LESSEQUAL, token_lessequal, "<=");
97+
RETURN_STATIC_TOKEN_STR(MINEQUAL, token_minequal, "-=");
98+
RETURN_STATIC_TOKEN_STR(NOTEQUAL, token_notequal, "!=");
99+
RETURN_STATIC_TOKEN_STR(PERCENTEQUAL, token_percentequal, "%=");
100+
RETURN_STATIC_TOKEN_STR(PLUSEQUAL, token_plusequal, "+=");
101+
RETURN_STATIC_TOKEN_STR(RARROW, token_rarrow, "->");
102+
RETURN_STATIC_TOKEN_STR(RIGHTSHIFT, token_rightshift, ">>");
103+
RETURN_STATIC_TOKEN_STR(RIGHTSHIFTEQUAL, token_rightshiftequal, ">>=");
104+
RETURN_STATIC_TOKEN_STR(SLASHEQUAL, token_slashequal, "/=");
105+
RETURN_STATIC_TOKEN_STR(STAREQUAL, token_starequal, "*=");
106+
RETURN_STATIC_TOKEN_STR(VBAREQUAL, token_vbarequal, "|=");
107+
RETURN_STATIC_TOKEN_STR(AMPEREQUAL, token_amperequal, "&=");
108+
}
109+
110+
#undef RETURN_STATIC_TOKEN_STR
111+
/* Unhandled type, or handled type with a different spelling. */
return NULL;
112+
}
113+
45114
/*[clinic input]
46115
@classmethod
47116
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
@@ -268,7 +337,11 @@ tokenizeriter_next(PyObject *op)
268337
str = Py_GetConstant(Py_CONSTANT_EMPTY_STR);
269338
}
270339
else {
271-
str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
340+
Py_ssize_t len = token.end - token.start;
341+
str = get_static_exact_token_str(type, token.start, len);
342+
if (str == NULL) {
343+
str = PyUnicode_FromStringAndSize(token.start, len);
344+
}
272345
}
273346
if (str == NULL) {
274347
goto exit;

0 commit comments

Comments
 (0)