gh-151763: Fix OOM-0034 tokenizer offset error handling

zainnadeem786 · zainnadeem786 · commit 82b78b0be9eb · 2026-06-20T18:24:04.000+05:00
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
@@ -11,7 +11,7 @@
 from textwrap import dedent
 from unittest import TestCase, mock
 from test import support
-from test.support import os_helper
+from test.support import import_helper, os_helper
 from test.support.script_helper import run_test_script, make_script, run_python_until_end
 from test.support.numbers import (
     VALID_UNDERSCORE_LITERALS,
@@ -2266,6 +2266,67 @@ def readline(encoding):
                 ))
                 self.assertEqual(tokens, expected)
 
+    @unittest.skipIf(support.Py_TRACE_REFS,
+                     '_testcapi.set_nomemory() is unreliable with Py_TRACE_REFS')
+    def test_col_offset_conversion_oom(self):
+        import_helper.import_module('_testcapi')
+        code = dedent(r"""
+            import _testcapi
+            import _tokenize
+
+            def check_indented_name(start):
+                source = "if True:\n  \u00e9 = 1\n"
+                it = _tokenize.TokenizerIter(
+                    iter(source.splitlines(True)).__next__,
+                    extra_tokens=False,
+                )
+                for _ in range(5):
+                    next(it)
+
+                _testcapi.set_nomemory(start, start + 1)
+                try:
+                    next(it)
+                except MemoryError:
+                    return True
+                finally:
+                    _testcapi.remove_mem_hooks()
+                return False
+
+            def check_multiline_string(start):
+                source = "x = '''abc\ndef'''\n"
+                it = _tokenize.TokenizerIter(
+                    iter(source.splitlines(True)).__next__,
+                    extra_tokens=False,
+                )
+                next(it)
+                next(it)
+
+                _testcapi.set_nomemory(start, start + 1)
+                try:
+                    next(it)
+                except MemoryError:
+                    return True
+                finally:
+                    _testcapi.remove_mem_hooks()
+                return False
+
+            def check_range(name, func):
+                seen_memory_error = False
+                for index in range(20):
+                    if func(index):
+                        seen_memory_error = True
+                if not seen_memory_error:
+                    raise AssertionError(f"{name}: MemoryError not raised")
+
+            check_range("line", check_indented_name)
+            check_range("raw", check_multiline_string)
+            print("MemoryError")
+        """)
+        with support.SuppressCrashReport():
+            res, _ = run_python_until_end("-c", code)
+        self.assertEqual(res.rc, 0, res.err.decode("ascii", "replace"))
+        self.assertIn(b"MemoryError", res.out)
+
     def test_int(self):
 
         self.check_tokenize('0xff <= 255', """\
diff --git a/Misc/NEWS.d/next/Library/2026-06-20-18-21-28.gh-issue-151763.OOM0034.rst b/Misc/NEWS.d/next/Library/2026-06-20-18-21-28.gh-issue-151763.OOM0034.rst
@@ -0,0 +1,4 @@
+Fix a possible crash in ``_tokenize.TokenizerIter`` when memory allocation
+fails while converting byte offsets to character offsets for non-ASCII source
+lines. The tokenizer now correctly propagates ``MemoryError`` instead of
+dereferencing a NULL pointer or returning a result with an exception set.
diff --git a/Parser/pegen.c b/Parser/pegen.c
@@ -27,6 +27,9 @@ Py_ssize_t
 _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
 {
     const unsigned char *data = (const unsigned char*)PyUnicode_AsUTF8(line);
+    if (data == NULL) {
+        return -1;
+    }
 
     Py_ssize_t len = 0;
     while (col_offset < end_col_offset) {
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
@@ -202,21 +202,27 @@ _get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t si
     return line;
 }
 
-static void
+static int
 _get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start,
                  PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno,
                  Py_ssize_t *col_offset, Py_ssize_t *end_col_offset)
 {
     _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it);
     Py_ssize_t byte_offset = -1;
+    Py_ssize_t byte_col_offset_diff = it->byte_col_offset_diff;
     if (token.start != NULL && token.start >= line_start) {
         byte_offset = token.start - line_start;
         if (line_changed) {
-            *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset);
-            it->byte_col_offset_diff = byte_offset - *col_offset;
+            Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_line(
+                line, 0, byte_offset);
+            if (offset < 0) {
+                return -1;
+            }
+            *col_offset = offset;
+            byte_col_offset_diff = byte_offset - *col_offset;
         }
         else {
-            *col_offset = byte_offset - it->byte_col_offset_diff;
+            *col_offset = byte_offset - byte_col_offset_diff;
         }
     }
 
@@ -226,17 +232,28 @@ _get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_s
             // If the whole token is at the same line, we can just use the token.start
             // buffer for figuring out the new column offset, since using line is not
             // performant for very long lines.
-            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(
+                line, byte_offset, end_byte_offset);
+            if (token_col_offset < 0) {
+                return -1;
+            }
             *end_col_offset = *col_offset + token_col_offset;
-            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
+            byte_col_offset_diff += token.end - token.start - token_col_offset;
         }
         else {
-            *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
-            it->byte_col_offset_diff += end_byte_offset - *end_col_offset;
+            Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_raw(
+                it->tok->line_start, end_byte_offset);
+            if (offset < 0) {
+                return -1;
+            }
+            *end_col_offset = offset;
+            byte_col_offset_diff += end_byte_offset - *end_col_offset;
         }
     }
+    it->byte_col_offset_diff = byte_col_offset_diff;
     it->last_lineno = lineno;
     it->last_end_lineno = end_lineno;
+    return 0;
 }
 
 static PyObject *
@@ -301,8 +318,11 @@ tokenizeriter_next(PyObject *op)
     Py_ssize_t end_lineno = it->tok->lineno;
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
-    _get_col_offsets(it, token, line_start, line, line_changed,
-                     lineno, end_lineno, &col_offset, &end_col_offset);
+    if (_get_col_offsets(it, token, line_start, line, line_changed,
+                         lineno, end_lineno, &col_offset, &end_col_offset) < 0) {
+        Py_DECREF(str);
+        goto exit;
+    }
 
     if (it->tok->tok_extra_tokens) {
         if (is_trailing_token) {

Original file line number	Diff line number	Diff line change
`@@ -27,6 +27,9 @@ Py_ssize_t`
`27`	`27`	`_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)`
`28`	`28`	`{`
`29`	`29`	`const unsigned char *data = (const unsigned char*)PyUnicode_AsUTF8(line);`
	`30`	`+ if (data == NULL) {`
	`31`	`+ return -1;`
	`32`	`+ }`
`30`	`33`
`31`	`34`	`Py_ssize_t len = 0;`
`32`	`35`	`while (col_offset < end_col_offset) {`