spitdoc/docx_to_md.py

306 lines
12 KiB
Python

import docx
import os
import argparse
from docx.shared import Inches
from docx.enum.text import WD_COLOR_INDEX
from docx.oxml.shared import qn
from docx.oxml import OxmlElement
import re
def get_used_outline_levels(doc):
    """Return the sorted, de-duplicated outline levels found on paragraph styles.

    For every paragraph in ``doc.paragraphs`` the outline level is read from
    the paragraph's *style* element (``style._element.pPr.outlineLvl.val``),
    not from the paragraph's own properties.

    Args:
        doc: Document-like object exposing a ``paragraphs`` iterable.

    Returns:
        list: Distinct outline level values in ascending order.
    """
    seen_levels = set()
    for para in doc.paragraphs:
        try:
            style_props = para.style._element.pPr
            if style_props is None:
                continue
            outline = style_props.outlineLvl
            if outline is None:
                continue
            seen_levels.add(outline.val)
        except AttributeError:
            # NOTE(review): any missing attribute along the chain (including
            # ``outlineLvl`` itself, if the pPr wrapper does not expose it)
            # silently skips the paragraph -- confirm this is intended.
            continue
    return sorted(seen_levels)
# "Heading 1", "heading2", "标题 1", "標題1", ... -> captures the numeric level.
# The Chinese words are matched as whole words; a character class such as
# ``[标题]`` would accept any single one of those characters (e.g. "问题1").
_HEADING_STYLE_RE = re.compile(r'(?:[Hh]eading|标题|標題)\s*(\d+)')


def get_heading_level_from_style(style_name):
    """Extract the heading level from a style name.

    Supports English ("Heading 1", "Heading1") as well as Simplified and
    Traditional Chinese ("标题 1", "標題1") style names, with or without
    whitespace between the word and the number.

    Args:
        style_name (str | None): Name of the paragraph style.

    Returns:
        int | None: The heading level, or None when the name is empty/None or
        does not look like a heading style.
    """
    if not style_name:
        return None
    found = _HEADING_STYLE_RE.search(style_name)
    if found is None:
        return None
    return int(found.group(1))
def map_outline_levels_to_markdown_levels(outline_levels):
    """Rank outline levels and map each one to a Markdown heading depth.

    The smallest outline level value becomes depth 1 (``#``), the next
    smallest depth 2 (``##``), and so on, regardless of gaps between the
    original values.

    Args:
        outline_levels: Iterable of outline level values (may be empty/None).

    Returns:
        dict: ``{outline_level: markdown_depth}``; empty when no levels given.
    """
    if not outline_levels:
        return {}
    ranked = sorted(outline_levels)
    return {level: depth for depth, level in enumerate(ranked, start=1)}
# Fully-qualified XML names used when scanning a paragraph for embedded pictures.
_W_DRAWING_TAG = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing'
_A_BLIP_TAG = '{http://schemas.openxmlformats.org/drawingml/2006/main}blip'
_R_EMBED_ATTR = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

# Markdown only defines heading depths 1 through 6.
_MAX_MD_HEADING_LEVEL = 6


def _save_document_images(doc, images_dir):
    """Write every embedded image to *images_dir* and return ``{rId: filename}``."""
    os.makedirs(images_dir, exist_ok=True)
    image_mapping = {}
    image_count = 1
    for rel in doc.part.rels.values():
        if "image" not in rel.target_ref:
            continue
        # An external relationship (e.g. a hyperlink whose URL happens to
        # contain "image") has no package part to read a blob from.
        # NOTE(review): relies on python-docx's ``is_external`` flag on
        # relationships; falls back to the old behaviour if it is absent.
        if getattr(rel, "is_external", False):
            continue
        # The blob is written unchanged; the ".png" name is kept for
        # compatibility even when the source image uses another format.
        image_filename = f"image_{image_count}.png"
        with open(os.path.join(images_dir, image_filename), "wb") as f:
            f.write(rel.target_part.blob)
        image_mapping[rel.rId] = image_filename
        image_count += 1
    return image_mapping


def _markdown_heading_level(para, level_mapping):
    """Return the Markdown heading depth (1-6) for *para*, or None for body text."""
    style = para.style
    # First choice: the outline level declared on the paragraph's style.
    try:
        style_props = style._element.pPr
        if style_props is not None and style_props.outlineLvl is not None:
            outline_level = style_props.outlineLvl.val
            if outline_level in level_mapping:
                return min(level_mapping[outline_level], _MAX_MD_HEADING_LEVEL)
    except AttributeError:
        pass
    # Fallback: derive the level from the style name ("Heading 2", "标题 2", ...).
    style_name = getattr(style, "name", None)
    if style_name:
        style_level = get_heading_level_from_style(style_name)
        if style_level is not None:
            return max(1, min(style_level, _MAX_MD_HEADING_LEVEL))
    return None


def _run_to_markdown(run):
    """Render one run as Markdown, approximating its character formatting."""
    text = run.text
    if not text:
        return ""
    core = text.strip()
    if not core:
        # Whitespace-only run: markers around blanks would not render.
        return text

    # Underline has no Markdown equivalent and is approximated by emphasis;
    # highlight is approximated by strong. Each marker is applied at most once
    # so that e.g. italic + underline does not accidentally become bold.
    emphasis = bool(run.italic) or bool(run.underline)
    strong = bool(run.bold)
    highlight = run.font.highlight_color
    # Look the NONE member up defensively: python-docx's documented
    # WD_COLOR_INDEX members do not appear to include it.
    if highlight and highlight != getattr(WD_COLOR_INDEX, "NONE", None):
        strong = True

    if emphasis:
        core = f"*{core}*"
    if strong:
        core = f"**{core}**"
    if run.font.strike:
        core = f"~~{core}~~"

    # Keep surrounding whitespace outside the markers; delimiters adjacent to
    # whitespace on the inside are not recognised as emphasis.
    leading = text[:len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    return leading + core + trailing


def _paragraph_image_names(para, image_mapping):
    """Return filenames of the extracted images embedded in *para*, in order."""
    names = []
    for drawing in para._element.findall('.//' + _W_DRAWING_TAG):
        for blip in drawing.findall('.//' + _A_BLIP_TAG):
            # The embed attribute holds the relationship id of the picture.
            rel_id = blip.get(_R_EMBED_ATTR)
            if rel_id and rel_id in image_mapping:
                names.append(image_mapping[rel_id])
    return names


def convert_docx_to_md(docx_path, md_path):
    """
    Convert a DOCX file to Markdown format.

    Embedded images are written to an ``images`` directory next to the output
    file and referenced from the Markdown. Body elements are processed in
    document order so paragraphs and top-level tables keep their positions.
    Heading depth comes from the style's outline level when available and
    from the style name otherwise, and never exceeds six.

    Args:
        docx_path (str): Path to the input DOCX file
        md_path (str): Path to the output MD file
    """
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    # Load the document
    doc = docx.Document(docx_path)

    # Extract all images first and remember which relationship id maps to
    # which file so paragraphs can reference them later.
    images_dir = os.path.join(os.path.dirname(md_path), "images")
    image_mapping = _save_document_images(doc, images_dir)

    # Outline levels actually used in the document, ranked into Markdown depths
    used_outline_levels = get_used_outline_levels(doc)
    level_mapping = map_outline_levels_to_markdown_levels(used_outline_levels)
    # Print debug information
    print(f"Used outline levels in document: {used_outline_levels}")
    print(f"Mapping to Markdown levels: {level_mapping}")

    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')

    md_content = []
    table_counter = 0
    # Walk the body's direct children to preserve paragraph/table order.
    for element in doc.element.body.xpath('./*'):
        if element.tag == paragraph_tag:
            para = Paragraph(element, doc)
            heading_level = _markdown_heading_level(para, level_mapping)
            if heading_level is not None:
                heading_text = para.text.strip()
                # Skip empty heading paragraphs instead of emitting a bare "#".
                if heading_text:
                    md_content.append('#' * heading_level + ' ' + heading_text + '\n')
            else:
                para_content = "".join(_run_to_markdown(run) for run in para.runs)
                if para_content.strip():
                    md_content.append(para_content + '\n')
            # Images belong to the paragraph they are anchored in, whether it
            # is a heading or body text.
            for image_filename in _paragraph_image_names(para, image_mapping):
                md_content.append(f"\n![Image](images/{image_filename})\n")
        elif element.tag == table_tag:
            # Wrap the element directly rather than searching doc.tables for
            # a table with identical serialized XML.
            table_counter += 1
            md_content.append(f"\n<!-- Table {table_counter} -->\n")
            md_content.append(convert_table_to_md(Table(element, doc)))

    # Write to file
    with open(md_path, "w", encoding="utf-8") as f:
        f.write('\n'.join(md_content))
def convert_table_to_md(table):
    """
    Convert a DOCX table to Markdown format.

    The first row is emitted as the header row. Shorter rows are padded with
    empty cells so every row has the same number of columns. Inside a cell,
    surrounding whitespace is stripped, literal ``|`` characters are escaped
    (they would otherwise be read as column separators) and line breaks
    become ``<br>``.

    Args:
        table: A python-docx table object (anything exposing ``rows`` whose
            items expose ``cells`` with a ``text`` attribute)

    Returns:
        str: Markdown formatted table followed by a trailing newline, or an
        empty string when the table has no rows
    """
    rows_data = [
        [
            cell.text.strip().replace('|', '\\|').replace('\n', '<br>')
            for cell in row.cells
        ]
        for row in table.rows
    ]
    if not rows_data:
        return ""

    # Ensure all rows have the same number of cells
    max_cells = max(len(row_data) for row_data in rows_data)
    padded_rows = [
        row_data + [""] * (max_cells - len(row_data)) for row_data in rows_data
    ]

    # Header row, separator row, then the data rows
    md_table = ["| " + " | ".join(padded_rows[0]) + " |"]
    md_table.append("| " + " | ".join(["---"] * max_cells) + " |")
    md_table.extend("| " + " | ".join(row_data) + " |" for row_data in padded_rows[1:])
    md_table.append("")  # Add blank line after table
    return "\n".join(md_table)
def extract_images_from_docx(docx_path, images_dir):
    """
    Extract images from a DOCX file to a specified directory.

    Every relationship of the main document part whose target reference
    contains "image" is written as ``image_<n>.png`` (numbered from 1 in
    relationship order). The image bytes are written unchanged, whatever the
    original format was.

    Args:
        docx_path (str): Path to the DOCX file
        images_dir (str): Directory to save images (created if missing)

    Returns:
        list: List of image filenames
    """
    doc = docx.Document(docx_path)
    # exist_ok avoids the race between checking for and creating the directory
    os.makedirs(images_dir, exist_ok=True)

    image_filenames = []
    for rel in doc.part.rels.values():
        if "image" not in rel.target_ref:
            continue
        # An external relationship (e.g. a hyperlink whose URL happens to
        # contain "image") has no package part to read a blob from.
        # NOTE(review): relies on python-docx's ``is_external`` flag on
        # relationships; falls back to the old behaviour if it is absent.
        if getattr(rel, "is_external", False):
            continue
        image_filename = f"image_{len(image_filenames) + 1}.png"
        with open(os.path.join(images_dir, image_filename), "wb") as f:
            f.write(rel.target_part.blob)
        image_filenames.append(image_filename)
    return image_filenames
if __name__ == "__main__":
    # Command-line entry point: convert one DOCX file into <output_dir>/<name>.md
    import sys

    parser = argparse.ArgumentParser(description="Convert DOCX file to Markdown format")
    parser.add_argument("docx_file", help="Path to the input DOCX file")
    parser.add_argument("output_dir", nargs='?', default=".", help="Output directory (default: current directory)")
    args = parser.parse_args()
    docx_file = args.docx_file
    output_dir = args.output_dir

    # Validate the input first so a bad path does not leave an empty
    # output directory behind.
    if not os.path.exists(docx_file):
        print(f"File {docx_file} not found")
        # sys.exit is always available; the bare exit() helper is only
        # injected by the site module.
        sys.exit(1)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate MD filename based on DOCX filename
    docx_basename = os.path.splitext(os.path.basename(docx_file))[0]
    md_file = os.path.join(output_dir, docx_basename + ".md")

    convert_docx_to_md(docx_file, md_file)
    print(f"Converted {docx_file} to {md_file}")