豆豆友情提示:这是一个非官方 GitHub 代理镜像,主要用于网络测试或访问加速。请勿在此进行登录、注册或处理任何敏感信息。进行这些操作请务必访问官方网站 github.com。 Raw 内容也通过此代理提供。
Skip to content

Commit c761512

Browse files
authored
fix(docx): handle inline formulas in list items (#3304)
* fix(docx) Handle inline formulas in list items

  Fixes issue where inline formulas in list items were ignored during conversion.
  Added helper methods to eliminate code duplication.
  Updated test data with list items containing inline equations.

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(docx): collect element refs in _add_inline_equations_to_parent

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
1 parent 3bab6b4 commit c761512

File tree

5 files changed

+683
-227
lines changed

5 files changed

+683
-227
lines changed

docling/backend/msword_backend.py

Lines changed: 218 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,13 +1115,24 @@ def _handle_text_elements(
11151115
# Check if this is actually a numbered list by examining the numFmt
11161116
is_numbered = self._is_numbered_list(numid, ilevel)
11171117

1118-
li = self._add_list_item(
1119-
doc=doc,
1120-
numid=numid,
1121-
ilevel=ilevel,
1122-
elements=paragraph_elements,
1123-
is_numbered=is_numbered,
1124-
)
1118+
# If there are equations in the list item, handle them specially
1119+
if len(equations) > 0:
1120+
li = self._add_list_item_with_equations(
1121+
doc=doc,
1122+
numid=numid,
1123+
ilevel=ilevel,
1124+
text=text,
1125+
equations=equations,
1126+
is_numbered=is_numbered,
1127+
)
1128+
else:
1129+
li = self._add_list_item(
1130+
doc=doc,
1131+
numid=numid,
1132+
ilevel=ilevel,
1133+
elements=paragraph_elements,
1134+
is_numbered=is_numbered,
1135+
)
11251136
elem_ref.extend(li) # MUST BE REF!!!
11261137
self._update_history(p_style_id, p_level, numid, ilevel)
11271138
return elem_ref
@@ -1196,40 +1207,14 @@ def _handle_text_elements(
11961207
parent=self.parents[level - 1], content_layer=self.content_layer
11971208
)
11981209
elem_ref.append(inline_equation.get_ref())
1199-
text_tmp = text
1200-
for eq in equations:
1201-
if len(text_tmp) == 0:
1202-
break
1203-
1204-
split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
12051210

1206-
pre_eq_text = split_text_tmp[0]
1207-
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
1208-
1209-
if len(pre_eq_text) > 0:
1210-
e1 = doc.add_text(
1211-
label=DocItemLabel.TEXT,
1212-
parent=inline_equation,
1213-
text=pre_eq_text,
1214-
content_layer=self.content_layer,
1215-
)
1216-
elem_ref.append(e1.get_ref())
1217-
e2 = doc.add_text(
1218-
label=DocItemLabel.FORMULA,
1219-
parent=inline_equation,
1220-
text=eq.replace("<eq>", "").replace("</eq>", ""),
1221-
content_layer=self.content_layer,
1222-
)
1223-
elem_ref.append(e2.get_ref())
1224-
1225-
if len(text_tmp) > 0:
1226-
e3 = doc.add_text(
1227-
label=DocItemLabel.TEXT,
1228-
parent=inline_equation,
1229-
text=text_tmp.strip(),
1230-
content_layer=self.content_layer,
1231-
)
1232-
elem_ref.append(e3.get_ref())
1211+
self._add_inline_equations_to_parent(
1212+
doc=doc,
1213+
parent=inline_equation,
1214+
text=text,
1215+
equations=equations,
1216+
elem_ref=elem_ref,
1217+
)
12331218

12341219
elif p_style_id in [
12351220
"Paragraph",
@@ -1425,28 +1410,99 @@ def _add_list_item_with_marker(
14251410
enum_marker = ""
14261411
self._add_formatted_list_item(doc, elements, enum_marker, is_numbered, level)
14271412

1428-
def _add_list_item(
1413+
def _add_inline_equations_to_parent(
1414+
self,
1415+
*,
1416+
doc: DoclingDocument,
1417+
parent: NodeItem,
1418+
text: str,
1419+
equations: list[str],
1420+
elem_ref: list[RefItem] | None = None,
1421+
) -> None:
1422+
"""Add text and inline equations as children of a parent element.
1423+
1424+
This helper method splits text by equation markers and adds alternating
1425+
TEXT and FORMULA elements as children of the given parent. This logic
1426+
is shared between regular paragraphs with inline equations and list items
1427+
with inline equations.
1428+
1429+
Args:
1430+
doc: The DoclingDocument being constructed.
1431+
parent: The parent element (inline_group) to add children to.
1432+
text: The paragraph text with equation placeholders (e.g., "<eq>formula</eq>").
1433+
equations: List of equation strings with markers (e.g., ["<eq>A=B</eq>", ...]).
1434+
elem_ref: Optional list to append created element references to.
1435+
"""
1436+
text_tmp = text
1437+
for eq in equations:
1438+
if len(text_tmp) == 0:
1439+
break
1440+
1441+
split_text_tmp = text_tmp.split(eq.strip(), maxsplit=1)
1442+
1443+
pre_eq_text = split_text_tmp[0]
1444+
text_tmp = "" if len(split_text_tmp) == 1 else split_text_tmp[1]
1445+
1446+
if len(pre_eq_text) > 0:
1447+
e1 = doc.add_text(
1448+
label=DocItemLabel.TEXT,
1449+
parent=parent,
1450+
text=pre_eq_text,
1451+
content_layer=self.content_layer,
1452+
)
1453+
if elem_ref is not None:
1454+
elem_ref.append(e1.get_ref())
1455+
1456+
e2 = doc.add_text(
1457+
label=DocItemLabel.FORMULA,
1458+
parent=parent,
1459+
text=eq.replace("<eq>", "").replace("</eq>", ""),
1460+
content_layer=self.content_layer,
1461+
)
1462+
if elem_ref is not None:
1463+
elem_ref.append(e2.get_ref())
1464+
1465+
if len(text_tmp) > 0:
1466+
e3 = doc.add_text(
1467+
label=DocItemLabel.TEXT,
1468+
parent=parent,
1469+
text=text_tmp.strip(),
1470+
content_layer=self.content_layer,
1471+
)
1472+
if elem_ref is not None:
1473+
elem_ref.append(e3.get_ref())
1474+
1475+
def _manage_list_structure(
14291476
self,
14301477
*,
14311478
doc: DoclingDocument,
14321479
numid: int,
14331480
ilevel: int,
1434-
elements: list,
1435-
is_numbered: bool = False,
1436-
) -> list[RefItem]:
1437-
elem_ref: list[RefItem] = []
1438-
# this method is always called with is_numbered. Numbered lists should be properly addressed.
1439-
if not elements:
1440-
return elem_ref
1481+
) -> tuple[list[RefItem], int]:
1482+
"""Manage list structure and return elem_ref and use_level.
1483+
1484+
This helper method handles the list group creation and level management
1485+
that is common to both regular list items and list items with equations.
1486+
It determines whether to open a new list, continue an existing one, handle
1487+
indentation changes, or close lists based on the numbering context.
1488+
1489+
Args:
1490+
doc: The DoclingDocument being constructed.
1491+
numid: The numbering ID from the DOCX paragraph properties.
1492+
ilevel: The indentation level from the DOCX paragraph properties.
14411493
1494+
Returns:
1495+
A tuple containing the list of references to created list groups and
1496+
the level at which the list item should be added.
1497+
"""
1498+
elem_ref: list[RefItem] = []
14421499
level = self._get_level()
14431500
prev_indent = self._prev_indent()
1501+
14441502
if self._prev_numid() is None or (
14451503
self._prev_numid() == numid and self.level_at_new_list is None
14461504
): # Open new list
14471505
self.level_at_new_list = level
1448-
1449-
# Reset counters for the new numbering sequence
14501506
self._reset_list_counters_for_new_sequence(numid)
14511507

14521508
list_gr = doc.add_list_group(
@@ -1456,10 +1512,8 @@ def _add_list_item(
14561512
)
14571513
self.parents[level] = list_gr
14581514
elem_ref.append(list_gr.get_ref())
1515+
use_level = level
14591516

1460-
self._add_list_item_with_marker(
1461-
doc, elements, numid, ilevel, is_numbered, level
1462-
)
14631517
elif (
14641518
self._prev_numid() == numid
14651519
and self.level_at_new_list is not None
@@ -1477,15 +1531,8 @@ def _add_list_item(
14771531
)
14781532
self.parents[i] = list_gr1
14791533
elem_ref.append(list_gr1.get_ref())
1534+
use_level = self.level_at_new_list + ilevel
14801535

1481-
self._add_list_item_with_marker(
1482-
doc,
1483-
elements,
1484-
numid,
1485-
ilevel,
1486-
is_numbered,
1487-
self.level_at_new_list + ilevel,
1488-
)
14891536
elif (
14901537
self._prev_numid() == numid
14911538
and self.level_at_new_list is not None
@@ -1495,28 +1542,18 @@ def _add_list_item(
14951542
for k in self.parents:
14961543
if k > self.level_at_new_list + ilevel:
14971544
self.parents[k] = None
1498-
1499-
self._add_list_item_with_marker(
1500-
doc,
1501-
elements,
1502-
numid,
1503-
ilevel,
1504-
is_numbered,
1505-
self.level_at_new_list + ilevel,
1506-
)
1545+
use_level = self.level_at_new_list + ilevel
15071546

15081547
elif self._prev_numid() == numid and isinstance(
15091548
self.parents.get(level - 1), ListGroup
15101549
):
1511-
# Continue existing list - only if parent is actually a ListGroup
1512-
self._add_list_item_with_marker(
1513-
doc, elements, numid, ilevel, is_numbered, level - 1
1514-
)
1550+
# Continue existing list
1551+
use_level = level - 1
1552+
15151553
elif self._prev_numid() != numid or not isinstance(
15161554
self.parents.get(level - 1), ListGroup
15171555
):
1518-
# New list sequence: Different numid OR parent is not a ListGroup
1519-
# Use anchor-based level to place new list at the correct document position
1556+
# New list sequence
15201557
if self.level_at_new_list is not None:
15211558
use_level = self.level_at_new_list + ilevel
15221559
for k in list(self.parents.keys()):
@@ -1533,16 +1570,113 @@ def _add_list_item(
15331570
)
15341571
self.parents[use_level] = list_gr
15351572
elem_ref.append(list_gr.get_ref())
1573+
else:
1574+
use_level = level - 1
15361575

1537-
# Set marker and enumerated arguments if this is an enumeration element.
1538-
if is_numbered:
1539-
self._get_list_counter(numid, ilevel)
1540-
enum_marker = self._build_enum_marker(numid, ilevel)
1541-
else:
1542-
enum_marker = ""
1543-
self._add_formatted_list_item(
1544-
doc, elements, enum_marker, is_numbered, use_level
1576+
return elem_ref, use_level
1577+
1578+
def _add_list_item(
1579+
self,
1580+
*,
1581+
doc: DoclingDocument,
1582+
numid: int,
1583+
ilevel: int,
1584+
elements: list,
1585+
is_numbered: bool = False,
1586+
) -> list[RefItem]:
1587+
"""Add a regular list item without inline equations.
1588+
1589+
Args:
1590+
doc: The DoclingDocument being constructed.
1591+
numid: The numbering ID from the DOCX paragraph properties.
1592+
ilevel: The indentation level from the DOCX paragraph properties.
1593+
elements: List of (text, formatting, hyperlink) tuples representing the paragraph content.
1594+
is_numbered: Whether this is a numbered list (True) or bulleted list (False).
1595+
1596+
Returns:
1597+
List of references to created document elements.
1598+
"""
1599+
if not elements:
1600+
return []
1601+
1602+
elem_ref, use_level = self._manage_list_structure(
1603+
doc=doc, numid=numid, ilevel=ilevel
1604+
)
1605+
1606+
if is_numbered:
1607+
self._get_list_counter(numid, ilevel)
1608+
enum_marker = self._build_enum_marker(numid, ilevel)
1609+
else:
1610+
enum_marker = ""
1611+
1612+
self._add_formatted_list_item(
1613+
doc, elements, enum_marker, is_numbered, use_level
1614+
)
1615+
return elem_ref
1616+
1617+
def _add_list_item_with_equations(
1618+
self,
1619+
*,
1620+
doc: DoclingDocument,
1621+
numid: int,
1622+
ilevel: int,
1623+
text: str,
1624+
equations: list[str],
1625+
is_numbered: bool = False,
1626+
) -> list[RefItem]:
1627+
"""Add a list item that contains inline equations.
1628+
1629+
This method handles list items with inline formulas by creating an inline_group
1630+
structure similar to how non-list paragraphs with equations are handled. The text
1631+
is split by equation markers, and alternating TEXT and FORMULA elements are added
1632+
as children of the inline_group.
1633+
1634+
Args:
1635+
doc: The DoclingDocument being constructed.
1636+
numid: The numbering ID from the DOCX paragraph properties.
1637+
ilevel: The indentation level from the DOCX paragraph properties.
1638+
text: The paragraph text with equation placeholders (e.g., "<eq>formula</eq>").
1639+
equations: List of equation strings with markers (e.g., ["<eq>A=B</eq>", ...]).
1640+
is_numbered: Whether this is a numbered list (True) or bulleted list (False).
1641+
1642+
Returns:
1643+
List of references to created document elements.
1644+
"""
1645+
elem_ref, use_level = self._manage_list_structure(
1646+
doc=doc, numid=numid, ilevel=ilevel
1647+
)
1648+
1649+
if is_numbered:
1650+
self._get_list_counter(numid, ilevel)
1651+
enum_marker = self._build_enum_marker(numid, ilevel)
1652+
else:
1653+
enum_marker = ""
1654+
1655+
if not isinstance(self.parents[use_level], ListGroup):
1656+
_log.warning(
1657+
"Parent element of the list item is not a ListGroup. The list item will be ignored."
15451658
)
1659+
return elem_ref
1660+
1661+
list_item = doc.add_list_item(
1662+
marker=enum_marker,
1663+
enumerated=is_numbered,
1664+
parent=self.parents[use_level],
1665+
text="",
1666+
)
1667+
1668+
inline_group = doc.add_inline_group(
1669+
parent=list_item,
1670+
content_layer=self.content_layer,
1671+
)
1672+
1673+
self._add_inline_equations_to_parent(
1674+
doc=doc,
1675+
parent=inline_group,
1676+
text=text,
1677+
equations=equations,
1678+
)
1679+
15461680
return elem_ref
15471681

15481682
@staticmethod

tests/data/docx/equations.docx

1.46 KB
Binary file not shown.

0 commit comments

Comments
 (0)