From 153a05e6c633cd197a6d5d4a36c43119059da16f Mon Sep 17 00:00:00 2001 From: Yuxuan Chen Date: Thu, 4 Jun 2026 16:56:25 -0400 Subject: [PATCH 1/3] fix: escape invalid backslash sequences in generated docstrings --- .../codegen/writer/MarkdownConverter.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java b/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java index 30e2e003b..42cccc185 100644 --- a/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java +++ b/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java @@ -168,9 +168,20 @@ private static String postProcessPandocOutput(String output) { // Remove empty lines at the start and end output = output.trim(); - // Remove unnecessary backslash escapes that pandoc adds for markdown - // These characters don't need escaping in Python docstrings - output = output.replaceAll("\\\\([\\[\\]'{}()<>`@_*|!~$#^])", "$1"); + // Remove unnecessary backslash escapes that pandoc adds for markdown. + // These characters don't need escaping in Python docstrings. The + // negative lookbehind ensures we only strip a single, spurious escape + // and never consume one half of a literal "\\" (which is a valid Python + // escape and must be preserved, e.g. a charset like "[\\]"). + output = output.replaceAll("(?`@_*|!~$#^])", "$1"); + + // Escape any remaining lone backslash that does not form a valid Python + // escape sequence, so the docstring is a valid Python string. A backslash + // is left untouched when it begins a recognized Python escape (quote, + // letter escapes, octal, hex, Unicode, or a line continuation) and is + // doubled otherwise (e.g. a charset like a lone backslash between spaces). 
+ // See https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences + output = output.replaceAll("(? and tags with admonitions for mkdocstrings output = replaceAdmonitionTags(output, "note", "Note"); From 8b30624d7fda66daf1c2090e7c71a51f703d4c76 Mon Sep 17 00:00:00 2001 From: Yuxuan Chen Date: Fri, 5 Jun 2026 10:51:56 -0400 Subject: [PATCH 2/3] test: add backslash escape regression tests for generated docstrings --- .../codegen/writer/MarkdownConverterTest.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java b/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java index 028ec1aaf..d379ea187 100644 --- a/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java +++ b/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java @@ -128,6 +128,34 @@ public void testConvertRemovesUnnecessaryBackslashEscapes() { assertEquals("Text with [brackets] and {braces} and (parens)", result.trim()); } + @Test + public void testConvertPreservesLiteralDoubleBackslash() { + // A literal "\\" is a valid Python escape (e.g. a password charset like + // "[\\]") and must be preserved. + String html = "@[\\\\]^"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`@[\\\\]^`", result); + } + + @Test + public void testConvertEscapesLoneBackslash() { + // A lone backslash that is not part of a recognized Python escape (here a + // backslash followed by a space) must be doubled so the docstring is a + // valid Python string. 
+ String html = "[ \\ ]"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`[ \\\\ ]`", result); + } + + @Test + public void testConvertPreservesValidPythonEscapes() { + // A backslash that already forms a valid Python escape (e.g. \") must be + // left untouched rather than doubled. + String html = "!\\\"#"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`!\\\"#`", result); + } + @Test public void testConvertMixedElements() { String html = "

Title

Paragraph

  • Item 1
  • Item 2
"; From bba6f2356fd57ffe31bbc61fde085f09cec49f34 Mon Sep 17 00:00:00 2001 From: Yuxuan Chen Date: Fri, 5 Jun 2026 16:44:31 -0400 Subject: [PATCH 3/3] fix: normalize odd-length backslash runs in generated docstrings --- .../codegen/writer/MarkdownConverter.java | 56 ++++++++++++++----- .../codegen/writer/MarkdownConverterTest.java | 20 +++++++ 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java b/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java index 42cccc185..bdab88b4f 100644 --- a/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java +++ b/codegen/core/src/main/java/software/amazon/smithy/python/codegen/writer/MarkdownConverter.java @@ -168,20 +168,9 @@ private static String postProcessPandocOutput(String output) { // Remove empty lines at the start and end output = output.trim(); - // Remove unnecessary backslash escapes that pandoc adds for markdown. - // These characters don't need escaping in Python docstrings. The - // negative lookbehind ensures we only strip a single, spurious escape - // and never consume one half of a literal "\\" (which is a valid Python - // escape and must be preserved, e.g. a charset like "[\\]"). - output = output.replaceAll("(?`@_*|!~$#^])", "$1"); - - // Escape any remaining lone backslash that does not form a valid Python - // escape sequence, so the docstring is a valid Python string. A backslash - // is left untouched when it begins a recognized Python escape (quote, - // letter escapes, octal, hex, Unicode, or a line continuation) and is - // doubled otherwise (e.g. a charset like a lone backslash between spaces). - // See https://docs.python.org/3/reference/lexical_analysis.html#escape-sequences - output = output.replaceAll("(? 
and tags with admonitions for mkdocstrings output = replaceAdmonitionTags(output, "note", "Note"); @@ -191,6 +180,45 @@ private static String postProcessPandocOutput(String output) { return output.replace("$", "$$"); } + // A lone backslash before one of these is dropped: pandoc adds it for Markdown + // but it needs no escaping in a Python docstring. + private static final String MARKDOWN_ONLY_CHARS = "[](){}<>`@_*|!~$#^'."; + // A lone backslash before one of these is kept as the start of a Python escape. (' is left out on purpose; it lives in MARKDOWN_ONLY_CHARS.) + // NOTE(review): only this first character is checked; in Python the x, u, U and N escapes also need a well-formed payload (hex digits or a {name}), otherwise the literal is a SyntaxError - confirm and consider validating. + private static final String VALID_ESCAPE_CHARS = "\\\"abfnrtv01234567xNuU\r\n"; + private static final Pattern BACKSLASH_RUN = Pattern.compile("(\\\\+)([^\\\\]|$)", Pattern.DOTALL); + + /** + * Fixes pandoc's backslash escaping so the docstring is a valid Python literal. + * Each BACKSLASH_RUN match is a whole run of backslashes (group 1) plus the single character after it (group 2, empty when the run ends the input). + *

Backslashes must come in pairs, but pandoc can leave an odd-length run + * (it escapes literal backslashes and Markdown characters separately, and + * adjacent escapes pile up). For each run we keep the pairs; a leftover + * backslash is then kept if it forms a valid escape, dropped if it is only + * there for Markdown, or doubled otherwise. + */ + private static String normalizeBackslashEscapes(String output) { + Matcher m = BACKSLASH_RUN.matcher(output); + StringBuilder sb = new StringBuilder(); + while (m.find()) { + int runLength = m.group(1).length(); + String next = m.group(2); + int backslashes = (runLength / 2) * 2; + if (runLength % 2 != 0) { + char c = next.isEmpty() ? '\0' : next.charAt(0); + if (!next.isEmpty() && VALID_ESCAPE_CHARS.indexOf(c) >= 0) { + backslashes += 1; + } else if (next.isEmpty() || MARKDOWN_ONLY_CHARS.indexOf(c) < 0) { + backslashes += 2; + } + // else: Markdown-only char, drop the spurious backslash + } + m.appendReplacement(sb, Matcher.quoteReplacement("\\".repeat(backslashes) + next)); + } + m.appendTail(sb); + return sb.toString(); + } + /** * Replaces admonition tags (e.g. note, important) with Google-style format. * diff --git a/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java b/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java index d379ea187..d425f727a 100644 --- a/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java +++ b/codegen/core/src/test/java/software/amazon/smithy/python/codegen/writer/MarkdownConverterTest.java @@ -156,6 +156,26 @@ public void testConvertPreservesValidPythonEscapes() { assertEquals("`!\\\"#`", result); } + @Test + public void testConvertHandlesBackslashBeforeMarkdownChar() { + // A backslash followed by a Markdown-significant character (here "*") makes + // pandoc emit an odd-length backslash run. 
The spurious escape must be + // dropped so the result is valid Python. + String html = "arn:aws:iam::\\*:user/\\*"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("`arn:aws:iam::*:user/*`", result); + } + + @Test + public void testConvertHandlesBackslashStarNextToQuote() { + // The API Gateway model documents the comment marker "\*/". Next to a quote, + // pandoc emits an odd-length backslash run; the result must stay a valid + // Python literal. + String html = "

Do not include \"\\*/\" characters

"; + String result = MarkdownConverter.convert(html, createMockContext(true)).trim(); + assertEquals("Do not include \\\"\\\\*/\\\" characters", result); + } + @Test public void testConvertMixedElements() { String html = "

Title

Paragraph

  • Item 1
  • Item 2
";