# spitdoc/docx_to_md.py

import docx
import os
import argparse
from docx.shared import Inches
from docx.enum.text import WD_COLOR_INDEX
from docx.oxml.shared import qn
from docx.oxml import OxmlElement
import re

def get_used_outline_levels(doc):
    """Return the sorted, de-duplicated outline levels found on paragraph styles.

    Args:
        doc: Object exposing a ``paragraphs`` iterable; each paragraph's
            ``style._element.pPr.outlineLvl.val`` is inspected.

    Returns:
        list: Distinct outline level values in ascending order. Paragraphs
        whose style has no ``pPr``, no ``outlineLvl``, or is missing any
        attribute along that chain contribute nothing.
    """
    # NOTE(review): whether python-docx exposes ``outlineLvl`` as an attribute
    # of the pPr element is not visible from this file; if it does not, the
    # AttributeError handler below skips every paragraph -- confirm.
    seen_levels = set()

    for para in doc.paragraphs:
        try:
            style_ppr = para.style._element.pPr
            if style_ppr is None:
                continue
            level_element = style_ppr.outlineLvl
            if level_element is None:
                continue
            seen_levels.add(level_element.val)
        except AttributeError:
            # Any missing link in the attribute chain means "no outline level".
            continue

    return sorted(seen_levels)

# "Heading N" / "heading N" (English) or "标题 N" / "標題 N" (Simplified /
# Traditional Chinese), with or without whitespace before the number.
# The Chinese words are matched as whole words: a character class such as
# [标题] would accept any single one of those characters (e.g. "问题1").
_HEADING_STYLE_RE = re.compile(r'(?:[Hh]eading|标题|標題)\s*(\d+)')


def get_heading_level_from_style(style_name):
    """Extract the heading level from a style name.

    Supports English ("Heading 1", "heading2") and Chinese ("标题 1",
    "標題1") style names; whitespace between the word and the number is
    optional.

    Args:
        style_name (str): Name of the paragraph style. ``None`` or an empty
            string is accepted and treated as "not a heading".

    Returns:
        int | None: The number following the heading word, or ``None`` when
        the name does not look like a heading style.
    """
    if not style_name:
        return None

    level_match = _HEADING_STYLE_RE.search(style_name)
    if level_match is None:
        return None
    return int(level_match.group(1))

def map_outline_levels_to_markdown_levels(outline_levels):
    """Rank outline levels into consecutive Markdown heading depths.

    The smallest outline level value becomes depth 1 (``#``), the next
    smallest depth 2 (``##``), and so on.

    Args:
        outline_levels: Collection of outline level values; may be empty.

    Returns:
        dict: ``{outline_level: markdown_depth}``; empty when no levels given.
    """
    if not outline_levels:
        return {}

    ascending = sorted(outline_levels)
    # enumerate from 1 so the rank can be used directly as a '#' count.
    return {outline: depth for depth, outline in enumerate(ascending, start=1)}

def _extract_document_images(doc, images_dir):
    """Write each embedded image of *doc* into *images_dir*; return {rId: filename}."""
    image_mapping = {}
    image_count = 1
    for rel in doc.part.rels.values():
        # External relationships (hyperlinks, linked pictures) have no package
        # part behind them; reading target_part on one raises, so skip them
        # even when their URL happens to contain "image".
        if rel.is_external or "image" not in rel.target_ref:
            continue

        image_filename = f"image_{image_count}.png"
        with open(os.path.join(images_dir, image_filename), "wb") as f:
            f.write(rel.target_part.blob)

        image_mapping[rel.rId] = image_filename
        image_count += 1
    return image_mapping


def _markdown_heading_level(para, level_mapping):
    """Return the Markdown heading depth (1-6) for *para*, or None for body text."""
    md_heading_level = None

    # First choice: the outline level declared on the paragraph's style.
    # NOTE(review): if python-docx does not expose ``outlineLvl`` on pPr this
    # always falls through via AttributeError to the style-name check -- confirm.
    try:
        style_ppr = para.style._element.pPr
        if style_ppr is not None and style_ppr.outlineLvl is not None:
            outline_level = style_ppr.outlineLvl.val
            if outline_level in level_mapping:
                md_heading_level = level_mapping[outline_level]
    except AttributeError:
        pass

    # Fallback: a number embedded in the style name ("Heading 2", ...).
    if md_heading_level is None:
        md_heading_level = get_heading_level_from_style(para.style.name)

    if md_heading_level is None:
        return None
    # Markdown only has heading depths 1..6; clamp whichever source was used.
    return max(1, min(md_heading_level, 6))


def _format_run_as_markdown(run):
    """Return the text of *run* wrapped in Markdown emphasis markers."""
    text = run.text
    if not text:
        return ""

    core = text.strip()
    if not core:
        # Whitespace-only run: keep the spacing but never emit empty markers.
        return text

    # Markers must hug the text ("**bold** " rather than "**bold **"),
    # otherwise Markdown renderers do not treat them as emphasis.
    lead_len = len(text) - len(text.lstrip())
    lead = text[:lead_len]
    trail = text[lead_len + len(core):]

    highlight = run.font.highlight_color
    # NOTE(review): WD_COLOR_INDEX may not define NONE in the installed
    # python-docx; getattr keeps highlighted runs from raising in that case.
    no_highlight = getattr(WD_COLOR_INDEX, "NONE", None)
    highlighted = bool(highlight) and highlight != no_highlight

    # Bold and highlight both map to strong; italic and underline both map to
    # emphasis. Each marker is applied once so that e.g. italic+underline does
    # not stack into "**text**" (which would render as bold).
    if run.bold or highlighted:
        core = f"**{core}**"
    if run.italic or run.underline:
        core = f"*{core}*"
    if run.font.strike:
        core = f"~~{core}~~"

    return lead + core + trail


def _inline_image_filenames(para, image_mapping):
    """Return extracted-image filenames referenced by drawings inside *para*."""
    filenames = []
    drawing_elements = para._element.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
    for drawing in drawing_elements:
        blip_elements = drawing.findall('.//{http://schemas.openxmlformats.org/drawingml/2006/main}blip')
        for blip in blip_elements:
            # r:embed holds the relationship id of the picture part.
            rId = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
            if rId and rId in image_mapping:
                filenames.append(image_mapping[rId])
    return filenames


def convert_docx_to_md(docx_path, md_path):
    """
    Convert a DOCX file to Markdown format.

    Body paragraphs and tables are emitted in document order. Paragraphs
    recognised as headings become ``#``-style headings; other paragraphs keep
    bold / italic / strikethrough formatting, followed by any inline images.
    Embedded images are written to an ``images`` directory next to *md_path*
    and two debug lines about the outline-level mapping are printed.

    Args:
        docx_path (str): Path to the input DOCX file
        md_path (str): Path to the output MD file
    """
    # Imported here so the block does not depend on which docx submodules the
    # package happens to load as attributes at import time.
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    # Load the document
    doc = docx.Document(docx_path)

    # Images live in an "images" directory beside the Markdown file.
    images_dir = os.path.join(os.path.dirname(md_path), "images")
    os.makedirs(images_dir, exist_ok=True)

    # Extract all images first so paragraphs can reference them by rId.
    image_mapping = _extract_document_images(doc, images_dir)

    # Outline levels actually used in the document, mapped to Markdown depths.
    used_outline_levels = get_used_outline_levels(doc)
    level_mapping = map_outline_levels_to_markdown_levels(used_outline_levels)

    # Print debug information
    print(f"Used outline levels in document: {used_outline_levels}")
    print(f"Mapping to Markdown levels: {level_mapping}")

    paragraph_tag = qn('w:p')
    table_tag = qn('w:tbl')

    md_content = []
    table_counter = 0

    # Walk the body's direct children so paragraphs and tables keep their order.
    for element in doc.element.body.xpath('./*'):
        if element.tag == paragraph_tag:
            para = Paragraph(element, doc)

            md_heading_level = _markdown_heading_level(para, level_mapping)
            if md_heading_level is not None:
                md_content.append('#' * md_heading_level + ' ' + para.text + '\n')
                continue

            para_content = "".join(_format_run_as_markdown(run) for run in para.runs)
            if para_content.strip():
                md_content.append(para_content + '\n')

            # Inline images are emitted right after the paragraph that holds them.
            for image_filename in _inline_image_filenames(para, image_mapping):
                md_content.append(f"\n![Image](images/{image_filename})\n")

        elif element.tag == table_tag:
            # Wrap the element directly instead of searching doc.tables by
            # serialised XML, which was quadratic in the number of tables.
            table_counter += 1
            md_content.append(f"\n<!-- Table {table_counter} -->\n")
            md_content.append(convert_table_to_md(Table(element, doc)))

    # Write to file
    with open(md_path, "w", encoding="utf-8") as f:
        f.write('\n'.join(md_content))

def convert_table_to_md(table):
    """
    Convert a DOCX table to Markdown format.

    The first row is used as the header row. Shorter rows are padded with
    empty cells so every row has the same number of columns. Inside a cell,
    surrounding whitespace is stripped, line breaks become ``<br>`` and
    literal ``|`` characters are escaped so they cannot split the column.

    Args:
        table: A python-docx table object (anything exposing ``rows``, whose
            items expose ``cells``, whose items expose ``text``)

    Returns:
        str: Markdown formatted table ending with a newline, or an empty
        string when the table has no rows
    """
    def clean(cell_text):
        # Escape pipes first: an unescaped '|' would be read as a column break.
        return cell_text.strip().replace('|', '\\|').replace('\n', '<br>')

    rows_data = [[clean(cell.text) for cell in row.cells] for row in table.rows]
    if not rows_data:
        return ""

    # Pad every row to the widest row so the table stays rectangular.
    max_cells = max(len(row_data) for row_data in rows_data)
    for row_data in rows_data:
        row_data.extend([""] * (max_cells - len(row_data)))

    def render(cells):
        return "| " + " | ".join(cells) + " |"

    md_table = [render(rows_data[0]), render(["---"] * max_cells)]
    md_table.extend(render(row_data) for row_data in rows_data[1:])
    md_table.append("")  # Add blank line after table
    return "\n".join(md_table)

def extract_images_from_docx(docx_path, images_dir):
    """
    Extract images from a DOCX file to a specified directory.

    Files are named ``image_1.png``, ``image_2.png``, ... in the order the
    document part's relationships are iterated; the directory is created when
    missing.

    Args:
        docx_path (str): Path to the DOCX file
        images_dir (str): Directory to save images

    Returns:
        list: List of image filenames
    """
    doc = docx.Document(docx_path)
    os.makedirs(images_dir, exist_ok=True)

    image_filenames = []
    for rel in doc.part.rels.values():
        # External relationships (hyperlinks, linked pictures) have no package
        # part to read; touching target_part on one raises, so skip them even
        # when their URL happens to contain "image".
        if rel.is_external or "image" not in rel.target_ref:
            continue

        image_filename = f"image_{len(image_filenames) + 1}.png"
        with open(os.path.join(images_dir, image_filename), "wb") as f:
            f.write(rel.target_part.blob)

        image_filenames.append(image_filename)

    return image_filenames

if __name__ == "__main__":
    # Command-line entry point: convert one DOCX file into <output_dir>/<name>.md
    parser = argparse.ArgumentParser(description="Convert DOCX file to Markdown format")
    parser.add_argument("docx_file", help="Path to the input DOCX file")
    parser.add_argument("output_dir", nargs='?', default=".", help="Output directory (default: current directory)")

    args = parser.parse_args()

    docx_file = args.docx_file
    output_dir = args.output_dir

    # Validate the input before touching the filesystem, so a bad path does
    # not leave an empty output directory behind.
    if not os.path.isfile(docx_file):
        print(f"File {docx_file} not found")
        # SystemExit works even where the site-provided exit() helper is absent.
        raise SystemExit(1)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate MD filename based on DOCX filename
    docx_basename = os.path.splitext(os.path.basename(docx_file))[0]
    md_file = os.path.join(output_dir, docx_basename + ".md")

    convert_docx_to_md(docx_file, md_file)
    print(f"Converted {docx_file} to {md_file}")