Skip to content

Commit 610f247

Browse files
feat: add inline code, ordered lists, horizontal rules, bold+italic, strikethrough (#47)
* feat: add inline code, ordered lists, horizontal rules, bold+italic, strikethrough

  Extend parse_inline() with new inline formatting patterns:
  - Inline code (`code`), with pattern ordering that prevents backtick content from being parsed as bold/italic
  - Bold+italic combo (***text***), matched before bold/italic
  - Strikethrough (~~text~~)

  Extend from_markdown() with new block-level elements:
  - Ordered lists (1. item), mirroring the existing bullet list pattern
  - Horizontal rules (---, ***, ___), using the existing horizontal_rule() method

  Fix parse_inline() token format in list items and blockquotes:
  - Add tokens_to_text_nodes() helper that converts parse_inline() tokens ({"content": "text"}) to valid ProseMirror text nodes ({"type": "text", "text": "text"}) with marks preserved
  - Apply to flush_bullets, flush_ordered, flush_quotes, and single-line variants — fixes Substack rendering an empty body for lists/blockquotes

  All features verified against the live Substack API. Includes 33 new unit tests covering all additions plus regression tests.

* refactor: address PR review comments
  - Fix duplicate "Post Utilities" in module docstring
  - Remove numbering from parse_inline section comments
  - Flatten single-line blockquote/ordered/paragraph branch with elif
1 parent d94cd5f commit 610f247

2 files changed

Lines changed: 433 additions & 25 deletions

File tree

substack/post.py

Lines changed: 127 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,40 @@
88
import re
99
from typing import Dict, List
1010

11-
__all__ = ["Post", "parse_inline"]
11+
__all__ = ["Post", "parse_inline", "tokens_to_text_nodes"]
1212

1313
from substack.exceptions import SectionNotExistsException
1414

1515

16+
def tokens_to_text_nodes(tokens: List[Dict]) -> List[Dict]:
    """Convert parse_inline() tokens to ProseMirror text nodes.

    parse_inline() returns tokens shaped like
    ``{"content": "text", "marks": [...]}``, whereas ProseMirror expects
    ``{"type": "text", "text": "text", "marks": [...]}``.

    Args:
        tokens: Tokens as produced by parse_inline(). Falsy entries and
            entries whose "content" is missing or empty are skipped.

    Returns:
        A new list of text-node dicts, in the same order as the input.
        The "marks" key is only present when the token carried a
        non-empty marks value; that value is reused as-is (not copied).
    """
    nodes = []
    for token in tokens:
        # Skip placeholders and empty-content tokens rather than emitting
        # a text node that has no text.
        if not token or not token.get("content"):
            continue
        node = {"type": "text", "text": token["content"]}
        marks = token.get("marks")
        # Omit the key entirely for unmarked text instead of writing an
        # empty or None "marks" entry.
        if marks:
            node["marks"] = marks
        nodes.append(node)
    return nodes
32+
33+
1634
def parse_inline(text: str) -> List[Dict]:
1735
"""
1836
Convert inline Markdown in a text string into a list of tokens
1937
for use in the post content.
2038
2139
Supported formatting:
40+
- `code`: Text wrapped in backticks.
2241
- **Bold**: Text wrapped in double asterisks.
2342
- *Italic*: Text wrapped in single asterisks.
43+
- ***Bold+Italic***: Text wrapped in triple asterisks.
44+
- ~~Strikethrough~~: Text wrapped in double tildes.
2445
- [Links]: Text wrapped in square brackets followed by URL in parentheses.
2546
2647
Args:
@@ -37,33 +58,50 @@ def parse_inline(text: str) -> List[Dict]:
3758
return []
3859

3960
tokens = []
40-
# Process text character by character to handle nested formatting
41-
# We'll use regex to find all markdown patterns, then process them in order
4261

43-
# Find all markdown patterns: links, bold, italic
44-
# Pattern order: links first (to avoid conflicts), then bold, then italic
62+
# Pattern order matters: code > links > bold+italic > bold > italic > strikethrough
63+
code_pattern = r'`([^`]+)`'
4564
link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
65+
bold_italic_pattern = r'\*\*\*([^*]+)\*\*\*'
4666
bold_pattern = r'\*\*([^*]+)\*\*'
4767
italic_pattern = r'(?<!\*)\*([^*]+)\*(?!\*)' # Not preceded or followed by *
68+
strikethrough_pattern = r'~~([^~]+)~~'
4869

4970
# Find all matches with their positions
5071
matches = []
72+
73+
# Inline code FIRST -- content inside backticks must not be parsed for other formatting
74+
for match in re.finditer(code_pattern, text):
75+
matches.append((match.start(), match.end(), "code", match.group(1), None))
76+
77+
# Links
5178
for match in re.finditer(link_pattern, text):
5279
# Skip if it's an image link (starts with ![)
5380
# But do NOT skip normal links at position 0.
5481
if match.start() == 0 or text[match.start()-1:match.start()+1] != "![":
55-
matches.append((match.start(), match.end(), "link", match.group(1), match.group(2)))
82+
if not any(start <= match.start() < end for start, end, _, _, _ in matches):
83+
matches.append((match.start(), match.end(), "link", match.group(1), match.group(2)))
84+
85+
# Bold+italic combo
86+
for match in re.finditer(bold_italic_pattern, text):
87+
if not any(start <= match.start() < end for start, end, _, _, _ in matches):
88+
matches.append((match.start(), match.end(), "bold_italic", match.group(1), None))
5689

90+
# Bold
5791
for match in re.finditer(bold_pattern, text):
58-
# Check if this range is already covered by a link
5992
if not any(start <= match.start() < end for start, end, _, _, _ in matches):
6093
matches.append((match.start(), match.end(), "bold", match.group(1), None))
6194

95+
# Italic
6296
for match in re.finditer(italic_pattern, text):
63-
# Check if this range is already covered by a link or bold
6497
if not any(start <= match.start() < end for start, end, _, _, _ in matches):
6598
matches.append((match.start(), match.end(), "italic", match.group(1), None))
6699

100+
# Strikethrough
101+
for match in re.finditer(strikethrough_pattern, text):
102+
if not any(start <= match.start() < end for start, end, _, _, _ in matches):
103+
matches.append((match.start(), match.end(), "strikethrough", match.group(1), None))
104+
67105
# Sort matches by position
68106
matches.sort(key=lambda x: x[0])
69107

@@ -75,11 +113,21 @@ def parse_inline(text: str) -> List[Dict]:
75113
tokens.append({"content": text[last_pos:start]})
76114

77115
# Add the formatted content
78-
if match_type == "link":
116+
if match_type == "code":
117+
tokens.append({
118+
"content": content,
119+
"marks": [{"type": "code"}]
120+
})
121+
elif match_type == "link":
79122
tokens.append({
80123
"content": content,
81124
"marks": [{"type": "link", "attrs": {"href": url}}]
82125
})
126+
elif match_type == "bold_italic":
127+
tokens.append({
128+
"content": content,
129+
"marks": [{"type": "strong"}, {"type": "em"}]
130+
})
83131
elif match_type == "bold":
84132
tokens.append({
85133
"content": content,
@@ -90,6 +138,11 @@ def parse_inline(text: str) -> List[Dict]:
90138
"content": content,
91139
"marks": [{"type": "em"}]
92140
})
141+
elif match_type == "strikethrough":
142+
tokens.append({
143+
"content": content,
144+
"marks": [{"type": "strikethrough"}]
145+
})
93146

94147
last_pos = end
95148

@@ -503,7 +556,9 @@ def from_markdown(self, markdown_content: str, api=None):
503556
- Blockquotes: Lines starting with '>' (consecutive lines grouped)
504557
- Paragraphs: Regular text blocks
505558
- Bullet lists: Lines starting with '*' or '-'
506-
- Inline formatting: **bold** and *italic* within paragraphs
559+
- Ordered lists: Lines starting with '1.', '2.', etc.
560+
- Horizontal rules: Lines with ---, ***, or ___
561+
- Inline formatting: **bold**, *italic*, ***bold+italic***, `code`, ~~strikethrough~~
507562
508563
Args:
509564
markdown_content: Markdown string to parse and add to the post.
@@ -593,6 +648,11 @@ def from_markdown(self, markdown_content: str, api=None):
593648
if not text_content:
594649
continue
595650

651+
# Check for horizontal rule: ---, ***, ___
652+
if re.match(r'^(\*{3,}|-{3,}|_{3,})\s*$', text_content):
653+
self.horizontal_rule()
654+
continue
655+
596656
# Process headings (lines starting with '#' characters)
597657
if text_content.startswith("#"):
598658
level = len(text_content) - len(text_content.lstrip("#"))
@@ -648,14 +708,15 @@ def from_markdown(self, markdown_content: str, api=None):
648708

649709
self.add({"type": "captionedImage", "src": image_url})
650710

651-
# Process paragraphs, bullet lists, or blockquotes
711+
# Process paragraphs, bullet lists, ordered lists, or blockquotes
652712
else:
653713
if "\n" in text_content:
654-
# Process each line, grouping consecutive bullets
655-
# into a single bullet_list node and consecutive
656-
# blockquote lines into a single blockquote node.
714+
# Process each line, grouping consecutive bullets/ordered items
715+
# into list nodes and consecutive blockquote lines into a
716+
# single blockquote node.
657717
pending_bullets: List[List[Dict]] = []
658718
pending_quotes: List[str] = []
719+
pending_ordered: List[List[Dict]] = []
659720

660721
def flush_bullets():
661722
if not pending_bullets:
@@ -677,10 +738,7 @@ def flush_quotes():
677738
paragraphs: List[Dict] = []
678739
for quote_line in pending_quotes:
679740
tokens = parse_inline(quote_line)
680-
text_nodes = [
681-
{"type": "text", "text": t["content"]}
682-
for t in tokens if t
683-
]
741+
text_nodes = tokens_to_text_nodes(tokens)
684742
if text_nodes:
685743
paragraphs.append({"type": "paragraph", "content": text_nodes})
686744
node: Dict = {"type": "blockquote"}
@@ -689,20 +747,48 @@ def flush_quotes():
689747
self.draft_body["content"].append(node)
690748
pending_quotes.clear()
691749

750+
def flush_ordered():
    """Emit the buffered ordered-list items as a single ordered_list node.

    Closure over ``pending_ordered`` and ``self`` from the enclosing
    scope. Each buffered entry (a list of text nodes) becomes one
    list_item wrapping a single paragraph. The resulting ordered_list
    node is appended to ``self.draft_body["content"]`` and the buffer is
    emptied. Does nothing when the buffer is empty.
    """
    if not pending_ordered:
        return
    list_items = [
        {
            "type": "list_item",
            "content": [{"type": "paragraph", "content": item_nodes}],
        }
        for item_nodes in pending_ordered
    ]
    self.draft_body["content"].append(
        {"type": "ordered_list", "content": list_items}
    )
    # clear() empties the buffer in place; the inner node lists already
    # placed in list_items are separate objects and are unaffected.
    pending_ordered.clear()
763+
692764
for line in text_content.split("\n"):
693765
line = line.strip()
694766
if not line:
695767
flush_bullets()
768+
flush_ordered()
696769
flush_quotes()
697770
continue
698771

699772
# Check for blockquote marker
700773
if line.startswith("> ") or line == ">":
701774
flush_bullets()
775+
flush_ordered()
702776
quote_text = line[2:] if line.startswith("> ") else ""
703777
pending_quotes.append(quote_text)
704778
continue
705779

780+
# Check for ordered list marker
781+
ordered_match = re.match(r'^(\d+)\.\s+(.*)', line)
782+
if ordered_match:
783+
flush_bullets()
784+
flush_quotes()
785+
item_text = ordered_match.group(2).strip()
786+
tokens = parse_inline(item_text)
787+
text_nodes = tokens_to_text_nodes(tokens)
788+
if text_nodes:
789+
pending_ordered.append(text_nodes)
790+
continue
791+
706792
# Check for bullet marker
707793
bullet_text = None
708794
if line.startswith("* "):
@@ -713,31 +799,47 @@ def flush_quotes():
713799
bullet_text = line[1:].strip()
714800

715801
if bullet_text is not None:
802+
flush_ordered()
716803
flush_quotes()
717804
tokens = parse_inline(bullet_text)
718-
if tokens:
719-
pending_bullets.append(tokens)
805+
text_nodes = tokens_to_text_nodes(tokens)
806+
if text_nodes:
807+
pending_bullets.append(text_nodes)
720808
else:
721809
flush_bullets()
810+
flush_ordered()
722811
flush_quotes()
723812
tokens = parse_inline(line)
724813
self.add({"type": "paragraph", "content": tokens})
725814

726815
flush_bullets()
816+
flush_ordered()
727817
flush_quotes()
728818
else:
729-
# Single line — could be a blockquote or paragraph
819+
# Single line — blockquote, ordered list, or paragraph
730820
if text_content.startswith("> ") or text_content == ">":
731821
quote_text = text_content[2:] if text_content.startswith("> ") else ""
732822
tokens = parse_inline(quote_text)
733-
text_nodes = [
734-
{"type": "text", "text": t["content"]}
735-
for t in tokens if t
736-
]
823+
text_nodes = tokens_to_text_nodes(tokens)
737824
para = {"type": "paragraph", "content": text_nodes} if text_nodes else {"type": "paragraph"}
738825
self.draft_body["content"] = self.draft_body.get("content", []) + [
739826
{"type": "blockquote", "content": [para]}
740827
]
828+
829+
elif re.match(r'^(\d+)\.\s+(.*)', text_content):
830+
ordered_match = re.match(r'^(\d+)\.\s+(.*)', text_content)
831+
item_text = ordered_match.group(2).strip()
832+
tokens = parse_inline(item_text)
833+
text_nodes = tokens_to_text_nodes(tokens)
834+
if text_nodes:
835+
list_item = {
836+
"type": "list_item",
837+
"content": [{"type": "paragraph", "content": text_nodes}],
838+
}
839+
self.draft_body["content"].append(
840+
{"type": "ordered_list", "content": [list_item]}
841+
)
842+
741843
else:
742844
tokens = parse_inline(text_content)
743845
self.add({"type": "paragraph", "content": tokens})

0 commit comments

Comments
 (0)