Skip to content

Commit 82b78b0

Browse files
committed
gh-151763: Fix OOM-0034 tokenizer offset error handling
1 parent aa5b164 commit 82b78b0

4 files changed

Lines changed: 99 additions & 11 deletions

File tree

Lib/test/test_tokenize.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from textwrap import dedent
1212
from unittest import TestCase, mock
1313
from test import support
14-
from test.support import os_helper
14+
from test.support import import_helper, os_helper
1515
from test.support.script_helper import run_test_script, make_script, run_python_until_end
1616
from test.support.numbers import (
1717
VALID_UNDERSCORE_LITERALS,
@@ -2266,6 +2266,67 @@ def readline(encoding):
22662266
))
22672267
self.assertEqual(tokens, expected)
22682268

2269+
@unittest.skipIf(support.Py_TRACE_REFS,
2270+
'_testcapi.set_nomemory() is unreliable with Py_TRACE_REFS')
2271+
def test_col_offset_conversion_oom(self):
2272+
import_helper.import_module('_testcapi')
2273+
code = dedent(r"""
2274+
import _testcapi
2275+
import _tokenize
2276+
2277+
def check_indented_name(start):
2278+
source = "if True:\n \u00e9 = 1\n"
2279+
it = _tokenize.TokenizerIter(
2280+
iter(source.splitlines(True)).__next__,
2281+
extra_tokens=False,
2282+
)
2283+
for _ in range(5):
2284+
next(it)
2285+
2286+
_testcapi.set_nomemory(start, start + 1)
2287+
try:
2288+
next(it)
2289+
except MemoryError:
2290+
return True
2291+
finally:
2292+
_testcapi.remove_mem_hooks()
2293+
return False
2294+
2295+
def check_multiline_string(start):
2296+
source = "x = '''abc\ndef'''\n"
2297+
it = _tokenize.TokenizerIter(
2298+
iter(source.splitlines(True)).__next__,
2299+
extra_tokens=False,
2300+
)
2301+
next(it)
2302+
next(it)
2303+
2304+
_testcapi.set_nomemory(start, start + 1)
2305+
try:
2306+
next(it)
2307+
except MemoryError:
2308+
return True
2309+
finally:
2310+
_testcapi.remove_mem_hooks()
2311+
return False
2312+
2313+
def check_range(name, func):
2314+
seen_memory_error = False
2315+
for index in range(20):
2316+
if func(index):
2317+
seen_memory_error = True
2318+
if not seen_memory_error:
2319+
raise AssertionError(f"{name}: MemoryError not raised")
2320+
2321+
check_range("line", check_indented_name)
2322+
check_range("raw", check_multiline_string)
2323+
print("MemoryError")
2324+
""")
2325+
with support.SuppressCrashReport():
2326+
res, _ = run_python_until_end("-c", code)
2327+
self.assertEqual(res.rc, 0, res.err.decode("ascii", "replace"))
2328+
self.assertIn(b"MemoryError", res.out)
2329+
22692330
def test_int(self):
22702331

22712332
self.check_tokenize('0xff <= 255', """\
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix a possible crash in ``_tokenize.TokenizerIter`` when memory allocation
2+
fails while converting byte offsets to character offsets for non-ASCII source
3+
lines. The tokenizer now correctly propagates ``MemoryError`` instead of
4+
dereferencing a NULL pointer or returning a result with an exception set.

Parser/pegen.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ Py_ssize_t
2727
_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
2828
{
2929
const unsigned char *data = (const unsigned char*)PyUnicode_AsUTF8(line);
30+
if (data == NULL) {
31+
return -1;
32+
}
3033

3134
Py_ssize_t len = 0;
3235
while (col_offset < end_col_offset) {

Python/Python-tokenize.c

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -202,21 +202,27 @@ _get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t si
202202
return line;
203203
}
204204

205-
static void
205+
static int
206206
_get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
207207
PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
208208
Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
209209
{
210210
_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
211211
Py_ssize_t byte_offset = -1;
212+
Py_ssize_t byte_col_offset_diff = it->byte_col_offset_diff;
212213
if (token.start != NULL && token.start >= line_start) {
213214
byte_offset = token.start - line_start;
214215
if (line_changed) {
215-
*col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
216-
it->byte_col_offset_diff = byte_offset - *col_offset;
216+
Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_line(
217+
line, 0, byte_offset);
218+
if (offset < 0) {
219+
return -1;
220+
}
221+
*col_offset = offset;
222+
byte_col_offset_diff = byte_offset - *col_offset;
217223
}
218224
else {
219-
*col_offset = byte_offset - it->byte_col_offset_diff;
225+
*col_offset = byte_offset - byte_col_offset_diff;
220226
}
221227
}
222228

@@ -226,17 +232,28 @@ _get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_s
226232
// If the whole token is at the same line, we can just use the token.start
227233
// buffer for figuring out the new column offset, since using line is not
228234
// performant for very long lines.
229-
Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
235+
Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(
236+
line, byte_offset, end_byte_offset);
237+
if (token_col_offset < 0) {
238+
return -1;
239+
}
230240
*end_col_offset = *col_offset + token_col_offset;
231-
it->byte_col_offset_diff += token.end - token.start - token_col_offset;
241+
byte_col_offset_diff += token.end - token.start - token_col_offset;
232242
}
233243
else {
234-
*end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
235-
it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
244+
Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_raw(
245+
it->tok->line_start, end_byte_offset);
246+
if (offset < 0) {
247+
return -1;
248+
}
249+
*end_col_offset = offset;
250+
byte_col_offset_diff += end_byte_offset - *end_col_offset;
236251
}
237252
}
253+
it->byte_col_offset_diff = byte_col_offset_diff;
238254
it->last_lineno = lineno;
239255
it->last_end_lineno = end_lineno;
256+
return 0;
240257
}
241258

242259
static PyObject *
@@ -301,8 +318,11 @@ tokenizeriter_next(PyObject *op)
301318
Py_ssize_t end_lineno = it->tok->lineno;
302319
Py_ssize_t col_offset = -1;
303320
Py_ssize_t end_col_offset = -1;
304-
_get_col_offsets(it, token, line_start, line, line_changed,
305-
lineno, end_lineno, &col_offset, &end_col_offset);
321+
if (_get_col_offsets(it, token, line_start, line, line_changed,
322+
lineno, end_lineno, &col_offset, &end_col_offset) < 0) {
323+
Py_DECREF(str);
324+
goto exit;
325+
}
306326

307327
if (it->tok->tok_extra_tokens) {
308328
if (is_trailing_token) {

0 commit comments

Comments
 (0)