# spitdoc/analyze_outline.py

import docx
import argparse
import os
import re

# Style names look like "Heading 2", "标题 2" or "標題2"; capture the level number
# that follows the word for "heading".
_STYLE_LEVEL_RE = re.compile(r'(?:heading|标题|標題)\s*(\d+)', re.IGNORECASE)

# Numbering prefixes commonly used in Chinese documents, paired with the outline
# level they imply.  Checked in order; the first pattern that matches wins.
_TEXT_LEVEL_PATTERNS = (
    (re.compile(r'第[一二三四五六七八九十\d]+[章篇节]'), 1),   # 第一章 / 第2节 -> chapter
    (re.compile(r'[一二三四五六七八九十\d]+[、.]'), 2),        # 一、 / 1.      -> section
    (re.compile(r'[(（][一二三四五六七八九十\d]+[)）]'), 3),   # （一） / (1)   -> subsection
)


def _is_heading_style(style_name):
    """Return True if *style_name* names a heading style (English or Chinese)."""
    return (
        style_name.lower().startswith('heading')
        or '标题' in style_name
        or '標題' in style_name
    )


def _style_level(style_name):
    """Return the level number embedded in *style_name*, or None if there is none."""
    match = _STYLE_LEVEL_RE.search(style_name)
    return int(match.group(1)) if match else None


def _level_from_text(text):
    """Guess the outline level from the numbering prefix of *text* (default 1)."""
    for pattern, level in _TEXT_LEVEL_PATTERNS:
        if pattern.match(text):
            return level
    return 1


def analyze_document_structure(docx_path):
    """Analyze document structure to determine heading levels.

    Every paragraph whose style name marks it as a heading is collected, a
    report is printed to stdout, and the collected data is returned.

    Args:
        docx_path: Path of the .docx file, passed to ``docx.Document``.  An
            already-loaded document (any object exposing a ``paragraphs``
            attribute) is also accepted and used as-is.

    Returns:
        A list with one dict per heading paragraph, in document order, with
        the keys ``index`` (position in ``doc.paragraphs``), ``text``,
        ``style`` (style name), ``style_level`` (number parsed from the style
        name, or None), ``indent`` (count of leading whitespace characters)
        and ``actual_level`` (level inferred from the text's numbering
        prefix; 1 when no known prefix is present).
    """
    if hasattr(docx_path, 'paragraphs'):
        doc = docx_path
    else:
        doc = docx.Document(docx_path)

    heading_paragraphs = []
    for i, paragraph in enumerate(doc.paragraphs):
        # Defensive: treat a missing style or style name as "not a heading"
        # instead of raising AttributeError/TypeError.
        style_name = getattr(paragraph.style, 'name', None) or ''
        if not _is_heading_style(style_name):
            continue

        text = paragraph.text
        heading_paragraphs.append({
            'index': i,
            'text': text,
            'style': style_name,
            'style_level': _style_level(style_name),
            'indent': len(text) - len(text.lstrip()),  # Simple indent detection
            # The style alone can be misleading (a document may use a single
            # heading style for every level), so the level is also inferred
            # from the numbering pattern at the start of the text.
            'actual_level': _level_from_text(text.strip()),
        })

    print("Document structure analysis:")
    print(f"Total heading paragraphs found: {len(heading_paragraphs)}")

    # Print all heading paragraphs
    for n, heading in enumerate(heading_paragraphs, start=1):
        print(f"{n:2d}. Style: '{heading['style']}', Level: {heading['style_level']}, Indent: {heading['indent']}")
        print(f"    Text: {heading['text'][:100]}")

    print("\nAnalyzing document structure to determine actual heading levels:")

    for n, heading in enumerate(heading_paragraphs, start=1):
        text = heading['text'].strip()
        # str() keeps the column left-aligned for both ints and None, and
        # does not mistake a level of 0 for "no level".
        style_level = str(heading['style_level'])
        print(f"{n:2d}. Text: {text[:30]:30s} | Style level: {style_level:8} | Actual level: {heading['actual_level']}")

    return heading_paragraphs

if __name__ == "__main__":
    # Command-line entry point: analyze the DOCX file named on the command line.
    parser = argparse.ArgumentParser(description="Analyze document structure in DOCX file")
    parser.add_argument("docx_file", help="Path to the DOCX file")

    args = parser.parse_args()

    # isfile() also rejects directories, which would otherwise pass an
    # exists() check and fail later inside the DOCX loader.
    if not os.path.isfile(args.docx_file):
        # parser.error() writes the usage line and the message to stderr and
        # exits with status 2, so calling scripts can detect the failure.
        parser.error(f"File {args.docx_file} not found")

    analyze_document_structure(args.docx_file)