@@ -1115,13 +1115,24 @@ def _handle_text_elements(
11151115 # Check if this is actually a numbered list by examining the numFmt
11161116 is_numbered = self ._is_numbered_list (numid , ilevel )
11171117
1118- li = self ._add_list_item (
1119- doc = doc ,
1120- numid = numid ,
1121- ilevel = ilevel ,
1122- elements = paragraph_elements ,
1123- is_numbered = is_numbered ,
1124- )
1118+ # If there are equations in the list item, handle them specially
1119+ if len (equations ) > 0 :
1120+ li = self ._add_list_item_with_equations (
1121+ doc = doc ,
1122+ numid = numid ,
1123+ ilevel = ilevel ,
1124+ text = text ,
1125+ equations = equations ,
1126+ is_numbered = is_numbered ,
1127+ )
1128+ else :
1129+ li = self ._add_list_item (
1130+ doc = doc ,
1131+ numid = numid ,
1132+ ilevel = ilevel ,
1133+ elements = paragraph_elements ,
1134+ is_numbered = is_numbered ,
1135+ )
11251136 elem_ref .extend (li ) # MUST BE REF!!!
11261137 self ._update_history (p_style_id , p_level , numid , ilevel )
11271138 return elem_ref
@@ -1196,40 +1207,14 @@ def _handle_text_elements(
11961207 parent = self .parents [level - 1 ], content_layer = self .content_layer
11971208 )
11981209 elem_ref .append (inline_equation .get_ref ())
1199- text_tmp = text
1200- for eq in equations :
1201- if len (text_tmp ) == 0 :
1202- break
1203-
1204- split_text_tmp = text_tmp .split (eq .strip (), maxsplit = 1 )
12051210
1206- pre_eq_text = split_text_tmp [0 ]
1207- text_tmp = "" if len (split_text_tmp ) == 1 else split_text_tmp [1 ]
1208-
1209- if len (pre_eq_text ) > 0 :
1210- e1 = doc .add_text (
1211- label = DocItemLabel .TEXT ,
1212- parent = inline_equation ,
1213- text = pre_eq_text ,
1214- content_layer = self .content_layer ,
1215- )
1216- elem_ref .append (e1 .get_ref ())
1217- e2 = doc .add_text (
1218- label = DocItemLabel .FORMULA ,
1219- parent = inline_equation ,
1220- text = eq .replace ("<eq>" , "" ).replace ("</eq>" , "" ),
1221- content_layer = self .content_layer ,
1222- )
1223- elem_ref .append (e2 .get_ref ())
1224-
1225- if len (text_tmp ) > 0 :
1226- e3 = doc .add_text (
1227- label = DocItemLabel .TEXT ,
1228- parent = inline_equation ,
1229- text = text_tmp .strip (),
1230- content_layer = self .content_layer ,
1231- )
1232- elem_ref .append (e3 .get_ref ())
1211+ self ._add_inline_equations_to_parent (
1212+ doc = doc ,
1213+ parent = inline_equation ,
1214+ text = text ,
1215+ equations = equations ,
1216+ elem_ref = elem_ref ,
1217+ )
12331218
12341219 elif p_style_id in [
12351220 "Paragraph" ,
@@ -1425,28 +1410,99 @@ def _add_list_item_with_marker(
14251410 enum_marker = ""
14261411 self ._add_formatted_list_item (doc , elements , enum_marker , is_numbered , level )
14271412
1428- def _add_list_item (
def _add_inline_equations_to_parent(
    self,
    *,
    doc: DoclingDocument,
    parent: NodeItem,
    text: str,
    equations: list[str],
    elem_ref: list[RefItem] | None = None,
) -> None:
    """Add text and inline equations as children of a parent element.

    The text is consumed left to right: for each equation, the text before
    its first occurrence becomes a TEXT child and the equation itself (with
    the ``<eq>``/``</eq>`` markers removed) becomes a FORMULA child. Whatever
    text remains after the last processed equation is stripped and, if
    non-empty, added as a final TEXT child. This logic is shared between
    regular paragraphs with inline equations and list items with inline
    equations.

    Args:
        doc: The DoclingDocument being constructed.
        parent: The parent element (inline_group) to add children to.
        text: The paragraph text with equation placeholders
            (e.g., "<eq>formula</eq>").
        equations: List of equation strings with markers
            (e.g., ["<eq>A=B</eq>", ...]).
        elem_ref: Optional list to append created element references to.
    """

    def _emit(label, content: str) -> None:
        # Create one child under ``parent`` and record its reference when
        # the caller asked for references to be collected.
        item = doc.add_text(
            label=label,
            parent=parent,
            text=content,
            content_layer=self.content_layer,
        )
        if elem_ref is not None:
            elem_ref.append(item.get_ref())

    remaining = text
    for eq in equations:
        if not remaining:
            # The text is exhausted; any further equations are not emitted.
            break

        # Split on the first occurrence only. When the equation is not found,
        # the whole remaining text becomes the leading text and nothing is
        # left over for later equations.
        leading, _, remaining = remaining.partition(eq.strip())

        if leading:
            _emit(DocItemLabel.TEXT, leading)

        _emit(DocItemLabel.FORMULA, eq.replace("<eq>", "").replace("</eq>", ""))

    # Strip BEFORE testing for emptiness so that a whitespace-only tail does
    # not produce an empty TEXT child.
    trailing = remaining.strip()
    if trailing:
        _emit(DocItemLabel.TEXT, trailing)
1474+
1475+ def _manage_list_structure (
14291476 self ,
14301477 * ,
14311478 doc : DoclingDocument ,
14321479 numid : int ,
14331480 ilevel : int ,
1434- elements : list ,
1435- is_numbered : bool = False ,
1436- ) -> list [RefItem ]:
1437- elem_ref : list [RefItem ] = []
1438- # this method is always called with is_numbered. Numbered lists should be properly addressed.
1439- if not elements :
1440- return elem_ref
1481+ ) -> tuple [list [RefItem ], int ]:
1482+ """Manage list structure and return elem_ref and use_level.
1483+
1484+ This helper method handles the list group creation and level management
1485+ that is common to both regular list items and list items with equations.
1486+ It determines whether to open a new list, continue an existing one, handle
1487+ indentation changes, or close lists based on the numbering context.
1488+
1489+ Args:
1490+ doc: The DoclingDocument being constructed.
1491+ numid: The numbering ID from the DOCX paragraph properties.
1492+ ilevel: The indentation level from the DOCX paragraph properties.
14411493
1494+ Returns:
1495+ A tuple containing the list of references to created list groups and
1496+ the level at which the list item should be added.
1497+ """
1498+ elem_ref : list [RefItem ] = []
14421499 level = self ._get_level ()
14431500 prev_indent = self ._prev_indent ()
1501+
14441502 if self ._prev_numid () is None or (
14451503 self ._prev_numid () == numid and self .level_at_new_list is None
14461504 ): # Open new list
14471505 self .level_at_new_list = level
1448-
1449- # Reset counters for the new numbering sequence
14501506 self ._reset_list_counters_for_new_sequence (numid )
14511507
14521508 list_gr = doc .add_list_group (
@@ -1456,10 +1512,8 @@ def _add_list_item(
14561512 )
14571513 self .parents [level ] = list_gr
14581514 elem_ref .append (list_gr .get_ref ())
1515+ use_level = level
14591516
1460- self ._add_list_item_with_marker (
1461- doc , elements , numid , ilevel , is_numbered , level
1462- )
14631517 elif (
14641518 self ._prev_numid () == numid
14651519 and self .level_at_new_list is not None
@@ -1477,15 +1531,8 @@ def _add_list_item(
14771531 )
14781532 self .parents [i ] = list_gr1
14791533 elem_ref .append (list_gr1 .get_ref ())
1534+ use_level = self .level_at_new_list + ilevel
14801535
1481- self ._add_list_item_with_marker (
1482- doc ,
1483- elements ,
1484- numid ,
1485- ilevel ,
1486- is_numbered ,
1487- self .level_at_new_list + ilevel ,
1488- )
14891536 elif (
14901537 self ._prev_numid () == numid
14911538 and self .level_at_new_list is not None
@@ -1495,28 +1542,18 @@ def _add_list_item(
14951542 for k in self .parents :
14961543 if k > self .level_at_new_list + ilevel :
14971544 self .parents [k ] = None
1498-
1499- self ._add_list_item_with_marker (
1500- doc ,
1501- elements ,
1502- numid ,
1503- ilevel ,
1504- is_numbered ,
1505- self .level_at_new_list + ilevel ,
1506- )
1545+ use_level = self .level_at_new_list + ilevel
15071546
15081547 elif self ._prev_numid () == numid and isinstance (
15091548 self .parents .get (level - 1 ), ListGroup
15101549 ):
1511- # Continue existing list - only if parent is actually a ListGroup
1512- self ._add_list_item_with_marker (
1513- doc , elements , numid , ilevel , is_numbered , level - 1
1514- )
1550+ # Continue existing list
1551+ use_level = level - 1
1552+
15151553 elif self ._prev_numid () != numid or not isinstance (
15161554 self .parents .get (level - 1 ), ListGroup
15171555 ):
1518- # New list sequence: Different numid OR parent is not a ListGroup
1519- # Use anchor-based level to place new list at the correct document position
1556+ # New list sequence
15201557 if self .level_at_new_list is not None :
15211558 use_level = self .level_at_new_list + ilevel
15221559 for k in list (self .parents .keys ()):
@@ -1533,16 +1570,113 @@ def _add_list_item(
15331570 )
15341571 self .parents [use_level ] = list_gr
15351572 elem_ref .append (list_gr .get_ref ())
1573+ else :
1574+ use_level = level - 1
15361575
1537- # Set marker and enumerated arguments if this is an enumeration element.
1538- if is_numbered :
1539- self ._get_list_counter (numid , ilevel )
1540- enum_marker = self ._build_enum_marker (numid , ilevel )
1541- else :
1542- enum_marker = ""
1543- self ._add_formatted_list_item (
1544- doc , elements , enum_marker , is_numbered , use_level
1576+ return elem_ref , use_level
1577+
1578+ def _add_list_item (
1579+ self ,
1580+ * ,
1581+ doc : DoclingDocument ,
1582+ numid : int ,
1583+ ilevel : int ,
1584+ elements : list ,
1585+ is_numbered : bool = False ,
1586+ ) -> list [RefItem ]:
1587+ """Add a regular list item without inline equations.
1588+
1589+ Args:
1590+ doc: The DoclingDocument being constructed.
1591+ numid: The numbering ID from the DOCX paragraph properties.
1592+ ilevel: The indentation level from the DOCX paragraph properties.
1593+ elements: List of (text, formatting, hyperlink) tuples representing the paragraph content.
1594+ is_numbered: Whether this is a numbered list (True) or bulleted list (False).
1595+
1596+ Returns:
1597+ List of references to created document elements.
1598+ """
1599+ if not elements :
1600+ return []
1601+
1602+ elem_ref , use_level = self ._manage_list_structure (
1603+ doc = doc , numid = numid , ilevel = ilevel
1604+ )
1605+
1606+ if is_numbered :
1607+ self ._get_list_counter (numid , ilevel )
1608+ enum_marker = self ._build_enum_marker (numid , ilevel )
1609+ else :
1610+ enum_marker = ""
1611+
1612+ self ._add_formatted_list_item (
1613+ doc , elements , enum_marker , is_numbered , use_level
1614+ )
1615+ return elem_ref
1616+
def _add_list_item_with_equations(
    self,
    *,
    doc: DoclingDocument,
    numid: int,
    ilevel: int,
    text: str,
    equations: list[str],
    is_numbered: bool = False,
) -> list[RefItem]:
    """Add a list item that contains inline equations.

    The list item is created with empty text and receives an inline group
    as its child; the text is then split by equation markers and alternating
    TEXT and FORMULA elements are added to that inline group via
    ``_add_inline_equations_to_parent``.

    Args:
        doc: The DoclingDocument being constructed.
        numid: The numbering ID from the DOCX paragraph properties.
        ilevel: The indentation level from the DOCX paragraph properties.
        text: The paragraph text with equation placeholders
            (e.g., "<eq>formula</eq>").
        equations: List of equation strings with markers
            (e.g., ["<eq>A=B</eq>", ...]).
        is_numbered: Whether this is a numbered list (True) or bulleted
            list (False).

    Returns:
        References to the list groups created by ``_manage_list_structure``.
        References to the list item, its inline group and the text/formula
        children are not included.
    """
    elem_ref, use_level = self._manage_list_structure(
        doc=doc, numid=numid, ilevel=ilevel
    )

    if is_numbered:
        # Called before the marker is built; its return value is unused here.
        self._get_list_counter(numid, ilevel)
        enum_marker = self._build_enum_marker(numid, ilevel)
    else:
        enum_marker = ""

    # Use .get() so a level with no registered parent takes the warning path
    # below instead of raising KeyError.
    list_parent = self.parents.get(use_level)
    if not isinstance(list_parent, ListGroup):
        _log.warning(
            "Parent element of the list item is not a ListGroup. The list item will be ignored."
        )
        return elem_ref

    # The item itself carries no text; its content lives in the inline group.
    list_item = doc.add_list_item(
        marker=enum_marker,
        enumerated=is_numbered,
        parent=list_parent,
        text="",
    )

    inline_group = doc.add_inline_group(
        parent=list_item,
        content_layer=self.content_layer,
    )

    self._add_inline_equations_to_parent(
        doc=doc,
        parent=inline_group,
        text=text,
        equations=equations,
    )

    return elem_ref
15471681
15481682 @staticmethod
0 commit comments