88import re
99from typing import Dict , List
1010
11- __all__ = ["Post" , "parse_inline" ]
11+ __all__ = ["Post" , "parse_inline" , "tokens_to_text_nodes" ]
1212
1313from substack .exceptions import SectionNotExistsException
1414
1515
def tokens_to_text_nodes(tokens: List[Dict]) -> List[Dict]:
    """Convert parse_inline() tokens to ProseMirror text nodes.

    parse_inline() returns {"content": "text", "marks": [...]}.
    ProseMirror expects {"type": "text", "text": "text", "marks": [...]}.

    Args:
        tokens: Token dicts shaped like the output of parse_inline().
            ``None`` is treated as an empty list. Falsy entries and tokens
            whose "content" is missing or empty are skipped.

    Returns:
        A new list of ``{"type": "text", "text": ...}`` dicts. A node gets a
        "marks" key only when the source token has a non-empty "marks" value.
    """
    nodes: List[Dict] = []
    for token in tokens or []:
        # Empty text nodes carry no information, so drop them up front.
        if not token or not token.get("content"):
            continue
        node: Dict = {"type": "text", "text": token["content"]}
        marks = token.get("marks")
        if marks:
            # Copy the list so later edits to a node's marks never leak back
            # into the token it was built from (and vice versa).
            node["marks"] = list(marks)
        nodes.append(node)
    return nodes
32+
33+
1634def parse_inline (text : str ) -> List [Dict ]:
1735 """
1836 Convert inline Markdown in a text string into a list of tokens
1937 for use in the post content.
2038
2139 Supported formatting:
40+ - `code`: Text wrapped in backticks.
2241 - **Bold**: Text wrapped in double asterisks.
2342 - *Italic*: Text wrapped in single asterisks.
43+ - ***Bold+Italic***: Text wrapped in triple asterisks.
44+ - ~~Strikethrough~~: Text wrapped in double tildes.
2445 - [Links]: Text wrapped in square brackets followed by URL in parentheses.
2546
2647 Args:
@@ -37,33 +58,50 @@ def parse_inline(text: str) -> List[Dict]:
3758 return []
3859
3960 tokens = []
40- # Process text character by character to handle nested formatting
41- # We'll use regex to find all markdown patterns, then process them in order
4261
43- # Find all markdown patterns: links, bold, italic
44- # Pattern order: links first (to avoid conflicts), then bold, then italic
62+ # Pattern order matters: code > links > bold+italic > bold > italic > strikethrough
63+ code_pattern = r'`([^`]+)`'
4564 link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
65+ bold_italic_pattern = r'\*\*\*([^*]+)\*\*\*'
4666 bold_pattern = r'\*\*([^*]+)\*\*'
4767 italic_pattern = r'(?<!\*)\*([^*]+)\*(?!\*)' # Not preceded or followed by *
68+ strikethrough_pattern = r'~~([^~]+)~~'
4869
4970 # Find all matches with their positions
5071 matches = []
72+
73+ # Inline code FIRST -- content inside backticks must not be parsed for other formatting
74+ for match in re .finditer (code_pattern , text ):
75+ matches .append ((match .start (), match .end (), "code" , match .group (1 ), None ))
76+
77+ # Links
5178 for match in re .finditer (link_pattern , text ):
5279 # Skip if it's an image link (starts with ![)
5380 # But do NOT skip normal links at position 0.
5481 if match .start () == 0 or text [match .start ()- 1 :match .start ()+ 1 ] != "![" :
55- matches .append ((match .start (), match .end (), "link" , match .group (1 ), match .group (2 )))
82+ if not any (start <= match .start () < end for start , end , _ , _ , _ in matches ):
83+ matches .append ((match .start (), match .end (), "link" , match .group (1 ), match .group (2 )))
84+
85+ # Bold+italic combo
86+ for match in re .finditer (bold_italic_pattern , text ):
87+ if not any (start <= match .start () < end for start , end , _ , _ , _ in matches ):
88+ matches .append ((match .start (), match .end (), "bold_italic" , match .group (1 ), None ))
5689
90+ # Bold
5791 for match in re .finditer (bold_pattern , text ):
58- # Check if this range is already covered by a link
5992 if not any (start <= match .start () < end for start , end , _ , _ , _ in matches ):
6093 matches .append ((match .start (), match .end (), "bold" , match .group (1 ), None ))
6194
95+ # Italic
6296 for match in re .finditer (italic_pattern , text ):
63- # Check if this range is already covered by a link or bold
6497 if not any (start <= match .start () < end for start , end , _ , _ , _ in matches ):
6598 matches .append ((match .start (), match .end (), "italic" , match .group (1 ), None ))
6699
100+ # Strikethrough
101+ for match in re .finditer (strikethrough_pattern , text ):
102+ if not any (start <= match .start () < end for start , end , _ , _ , _ in matches ):
103+ matches .append ((match .start (), match .end (), "strikethrough" , match .group (1 ), None ))
104+
67105 # Sort matches by position
68106 matches .sort (key = lambda x : x [0 ])
69107
@@ -75,11 +113,21 @@ def parse_inline(text: str) -> List[Dict]:
75113 tokens .append ({"content" : text [last_pos :start ]})
76114
77115 # Add the formatted content
78- if match_type == "link" :
116+ if match_type == "code" :
117+ tokens .append ({
118+ "content" : content ,
119+ "marks" : [{"type" : "code" }]
120+ })
121+ elif match_type == "link" :
79122 tokens .append ({
80123 "content" : content ,
81124 "marks" : [{"type" : "link" , "attrs" : {"href" : url }}]
82125 })
126+ elif match_type == "bold_italic" :
127+ tokens .append ({
128+ "content" : content ,
129+ "marks" : [{"type" : "strong" }, {"type" : "em" }]
130+ })
83131 elif match_type == "bold" :
84132 tokens .append ({
85133 "content" : content ,
@@ -90,6 +138,11 @@ def parse_inline(text: str) -> List[Dict]:
90138 "content" : content ,
91139 "marks" : [{"type" : "em" }]
92140 })
141+ elif match_type == "strikethrough" :
142+ tokens .append ({
143+ "content" : content ,
144+ "marks" : [{"type" : "strikethrough" }]
145+ })
93146
94147 last_pos = end
95148
@@ -503,7 +556,9 @@ def from_markdown(self, markdown_content: str, api=None):
503556 - Blockquotes: Lines starting with '>' (consecutive lines grouped)
504557 - Paragraphs: Regular text blocks
505558 - Bullet lists: Lines starting with '*' or '-'
506- - Inline formatting: **bold** and *italic* within paragraphs
559+ - Ordered lists: Lines starting with '1.', '2.', etc.
560+ - Horizontal rules: Lines with ---, ***, or ___
561+ - Inline formatting: **bold**, *italic*, ***bold+italic***, `code`, ~~strikethrough~~
507562
508563 Args:
509564 markdown_content: Markdown string to parse and add to the post.
@@ -593,6 +648,11 @@ def from_markdown(self, markdown_content: str, api=None):
593648 if not text_content :
594649 continue
595650
651+ # Check for horizontal rule: ---, ***, ___
652+ if re .match (r'^(\*{3,}|-{3,}|_{3,})\s*$' , text_content ):
653+ self .horizontal_rule ()
654+ continue
655+
596656 # Process headings (lines starting with '#' characters)
597657 if text_content .startswith ("#" ):
598658 level = len (text_content ) - len (text_content .lstrip ("#" ))
@@ -648,14 +708,15 @@ def from_markdown(self, markdown_content: str, api=None):
648708
649709 self .add ({"type" : "captionedImage" , "src" : image_url })
650710
651- # Process paragraphs, bullet lists, or blockquotes
711+ # Process paragraphs, bullet lists, ordered lists, or blockquotes
652712 else :
653713 if "\n " in text_content :
654- # Process each line, grouping consecutive bullets
655- # into a single bullet_list node and consecutive
656- # blockquote lines into a single blockquote node.
714+ # Process each line, grouping consecutive bullets/ordered items
715+ # into list nodes and consecutive blockquote lines into a
716+ # single blockquote node.
657717 pending_bullets : List [List [Dict ]] = []
658718 pending_quotes : List [str ] = []
719+ pending_ordered : List [List [Dict ]] = []
659720
660721 def flush_bullets ():
661722 if not pending_bullets :
@@ -677,10 +738,7 @@ def flush_quotes():
677738 paragraphs : List [Dict ] = []
678739 for quote_line in pending_quotes :
679740 tokens = parse_inline (quote_line )
680- text_nodes = [
681- {"type" : "text" , "text" : t ["content" ]}
682- for t in tokens if t
683- ]
741+ text_nodes = tokens_to_text_nodes (tokens )
684742 if text_nodes :
685743 paragraphs .append ({"type" : "paragraph" , "content" : text_nodes })
686744 node : Dict = {"type" : "blockquote" }
@@ -689,20 +747,48 @@ def flush_quotes():
689747 self .draft_body ["content" ].append (node )
690748 pending_quotes .clear ()
691749
750+ def flush_ordered ():
751+ if not pending_ordered :
752+ return
753+ list_items = []
754+ for item_nodes in pending_ordered :
755+ list_items .append ({
756+ "type" : "list_item" ,
757+ "content" : [{"type" : "paragraph" , "content" : item_nodes }],
758+ })
759+ self .draft_body ["content" ].append (
760+ {"type" : "ordered_list" , "content" : list_items }
761+ )
762+ pending_ordered .clear ()
763+
692764 for line in text_content .split ("\n " ):
693765 line = line .strip ()
694766 if not line :
695767 flush_bullets ()
768+ flush_ordered ()
696769 flush_quotes ()
697770 continue
698771
699772 # Check for blockquote marker
700773 if line .startswith ("> " ) or line == ">" :
701774 flush_bullets ()
775+ flush_ordered ()
702776 quote_text = line [2 :] if line .startswith ("> " ) else ""
703777 pending_quotes .append (quote_text )
704778 continue
705779
780+ # Check for ordered list marker
781+ ordered_match = re .match (r'^(\d+)\.\s+(.*)' , line )
782+ if ordered_match :
783+ flush_bullets ()
784+ flush_quotes ()
785+ item_text = ordered_match .group (2 ).strip ()
786+ tokens = parse_inline (item_text )
787+ text_nodes = tokens_to_text_nodes (tokens )
788+ if text_nodes :
789+ pending_ordered .append (text_nodes )
790+ continue
791+
706792 # Check for bullet marker
707793 bullet_text = None
708794 if line .startswith ("* " ):
@@ -713,31 +799,47 @@ def flush_quotes():
713799 bullet_text = line [1 :].strip ()
714800
715801 if bullet_text is not None :
802+ flush_ordered ()
716803 flush_quotes ()
717804 tokens = parse_inline (bullet_text )
718- if tokens :
719- pending_bullets .append (tokens )
805+ text_nodes = tokens_to_text_nodes (tokens )
806+ if text_nodes :
807+ pending_bullets .append (text_nodes )
720808 else :
721809 flush_bullets ()
810+ flush_ordered ()
722811 flush_quotes ()
723812 tokens = parse_inline (line )
724813 self .add ({"type" : "paragraph" , "content" : tokens })
725814
726815 flush_bullets ()
816+ flush_ordered ()
727817 flush_quotes ()
728818 else :
729- # Single line — could be a blockquote or paragraph
819+ # Single line — blockquote, ordered list, or paragraph
730820 if text_content .startswith ("> " ) or text_content == ">" :
731821 quote_text = text_content [2 :] if text_content .startswith ("> " ) else ""
732822 tokens = parse_inline (quote_text )
733- text_nodes = [
734- {"type" : "text" , "text" : t ["content" ]}
735- for t in tokens if t
736- ]
823+ text_nodes = tokens_to_text_nodes (tokens )
737824 para = {"type" : "paragraph" , "content" : text_nodes } if text_nodes else {"type" : "paragraph" }
738825 self .draft_body ["content" ] = self .draft_body .get ("content" , []) + [
739826 {"type" : "blockquote" , "content" : [para ]}
740827 ]
828+
829+ elif re .match (r'^(\d+)\.\s+(.*)' , text_content ):
830+ ordered_match = re .match (r'^(\d+)\.\s+(.*)' , text_content )
831+ item_text = ordered_match .group (2 ).strip ()
832+ tokens = parse_inline (item_text )
833+ text_nodes = tokens_to_text_nodes (tokens )
834+ if text_nodes :
835+ list_item = {
836+ "type" : "list_item" ,
837+ "content" : [{"type" : "paragraph" , "content" : text_nodes }],
838+ }
839+ self .draft_body ["content" ].append (
840+ {"type" : "ordered_list" , "content" : [list_item ]}
841+ )
842+
741843 else :
742844 tokens = parse_inline (text_content )
743845 self .add ({"type" : "paragraph" , "content" : tokens })
0 commit comments