spitdoc/analyze_outline.py

74 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import docx
import argparse
import os
import re
# Chinese words for "heading" (simplified and traditional script) that may
# appear anywhere in a style name.
_CJK_HEADING_WORDS = ('标题', '標題')

# Heading word followed by its level number, e.g. "Heading 2" or "标题 1".
# The alternatives are whole words; a character class such as "[标题]" would
# match a single character only.
_STYLE_LEVEL_RE = re.compile(r'(?:heading|标题|標題)\s*(\d+)', re.IGNORECASE)

# Numbering prefixes commonly used in Chinese documents.  Both ASCII and
# full-width punctuation are accepted, because either form can appear in text.
_NUMERAL = r'[一二三四五六七八九十\d]+'
_CHAPTER_RE = re.compile(r'第' + _NUMERAL + r'[章篇节]')      # e.g. "第一章"
_SECTION_RE = re.compile(_NUMERAL + r'[、.．]')               # e.g. "一、" or "1."
_SUBSECTION_RE = re.compile(r'[(（]' + _NUMERAL + r'[)）]')   # e.g. "(一)" or "（一）"


def _is_heading_style(style_name):
    """Return True if *style_name* names a heading style."""
    if style_name.lower().startswith('heading'):
        return True
    return any(word in style_name for word in _CJK_HEADING_WORDS)


def _style_level(style_name):
    """Return the level number embedded in *style_name*, or None if absent."""
    found = _STYLE_LEVEL_RE.search(style_name)
    return int(found.group(1)) if found else None


def _level_from_text(text):
    """Infer a heading level from the numbering prefix of *text*.

    Returns 1 for a chapter prefix, 2 for a section prefix, 3 for a
    parenthesised subsection prefix, and 1 when no prefix is recognised.
    """
    if _CHAPTER_RE.match(text):
        return 1
    if _SECTION_RE.match(text):
        return 2
    if _SUBSECTION_RE.match(text):
        return 3
    return 1  # Default to top level


def analyze_document_structure(docx_path):
    """Analyze document structure to determine heading levels.

    Opens the DOCX file, collects every paragraph whose style name looks
    like a heading style, and prints two reports to stdout: the headings
    with the level parsed from their style name, and the headings with a
    level inferred from the numbering prefix of their text.

    Args:
        docx_path: Path of the DOCX file, passed to ``docx.Document``.

    Returns:
        None. All results are printed.
    """
    doc = docx.Document(docx_path)

    # Collect all paragraphs with heading styles
    heading_paragraphs = []
    for index, paragraph in enumerate(doc.paragraphs):
        # A missing style or style name is treated as "not a heading"
        # rather than raising AttributeError.
        style_name = getattr(paragraph.style, 'name', None) or ''
        if not _is_heading_style(style_name):
            continue
        text = paragraph.text
        heading_paragraphs.append({
            'index': index,
            'text': text,
            'style': style_name,
            'style_level': _style_level(style_name),
            # Simple indent detection: count of leading whitespace characters
            'indent': len(text) - len(text.lstrip()),
        })

    print("Document structure analysis:")
    print(f"Total heading paragraphs found: {len(heading_paragraphs)}")

    # Print all heading paragraphs
    for number, heading in enumerate(heading_paragraphs, start=1):
        print(f"{number:2d}. Style: '{heading['style']}', Level: {heading['style_level']}, Indent: {heading['indent']}")
        print(f" Text: {heading['text'][:100]}")

    # The style name alone is not a reliable level indicator: a document may
    # use one heading style (e.g. "标题 11") for headings that are clearly at
    # different levels.  The numbering pattern of the text is used instead.
    print("\nAnalyzing document structure to determine actual heading levels:")
    for number, heading in enumerate(heading_paragraphs, start=1):
        text = heading['text'].strip()
        style_level = heading['style_level']
        # Compare against None explicitly so a parsed level of 0 is still shown.
        shown_level = 'None' if style_level is None else style_level
        actual_level = _level_from_text(text)
        print(f"{number:2d}. Text: {text[:30]:30s} | Style level: {shown_level:8} | Actual level: {actual_level}")
def main():
    """Parse the command line and analyze the DOCX file it names.

    Takes one positional argument, the path to the DOCX file.  If that path
    is not an existing regular file, the script exits with a non-zero status
    and the message is written to stderr, so callers can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Analyze document structure in DOCX file")
    parser.add_argument("docx_file", help="Path to the DOCX file")
    args = parser.parse_args()

    # isfile() rather than exists(): a directory exists but cannot be opened
    # as a document.
    if not os.path.isfile(args.docx_file):
        # SystemExit with a string argument prints it to stderr and exits
        # with status 1.
        raise SystemExit(f"File {args.docx_file} not found")

    analyze_document_structure(args.docx_file)


if __name__ == "__main__":
    main()