74 lines
3.1 KiB
Python
74 lines
3.1 KiB
Python
import docx
|
||
import argparse
|
||
import os
|
||
import re
|
||
|
||
# Number following a heading style keyword, e.g. "Heading 2", "heading2",
# "标题 11" or "標題 3". Matched case-insensitively, anywhere in the style name.
_STYLE_LEVEL_RE = re.compile(r'(?:heading|标题|標題)\s*(\d+)', re.IGNORECASE)

# Numbering prefixes commonly found at the start of headings in Chinese documents.
_CHAPTER_RE = re.compile(r'第[一二三四五六七八九十\d]+[章篇节]')
_SECTION_RE = re.compile(r'[一二三四五六七八九十\d]+[、.]')
# Accept both full-width and ASCII parentheses around the number.
_SUBSECTION_RE = re.compile(r'[（(][一二三四五六七八九十\d]+[）)]')


def _is_heading_style(style_name):
    """Return True if the style name looks like an English or Chinese heading style."""
    return (
        style_name.lower().startswith('heading')
        or '标题' in style_name
        or '標題' in style_name
    )


def _style_level(style_name):
    """Return the level number embedded in a heading style name, or None if absent."""
    match = _STYLE_LEVEL_RE.search(style_name)
    return int(match.group(1)) if match else None


def _infer_level_from_text(text):
    """Return 1, 2 or 3 from the numbering prefix of a heading text (default 1)."""
    if _CHAPTER_RE.match(text):
        return 1  # Chapter level, e.g. "第一章"
    if _SECTION_RE.match(text):
        return 2  # Section level, e.g. "一、" or "1."
    if _SUBSECTION_RE.match(text):
        return 3  # Subsection level, e.g. "（一）" or "(1)"
    return 1  # No recognised prefix: default to top level


def analyze_document_structure(docx_path):
    """Analyze document structure to determine heading levels.

    Opens the DOCX file, collects every paragraph whose style name marks it
    as a heading, prints a report of those paragraphs, and then prints a
    second report comparing the level taken from the style name with a level
    inferred from the numbering prefix of the heading text.

    Args:
        docx_path: Path of the DOCX file, passed to ``docx.Document``.

    Returns:
        A list with one dict per heading paragraph, in document order. Keys:
        ``index`` (position in ``doc.paragraphs``), ``text``, ``style``
        (style name), ``style_level`` (int parsed from the style name, or
        None), ``indent`` (count of leading whitespace characters in the
        text) and ``actual_level`` (1-3, inferred from the text prefix).
    """
    doc = docx.Document(docx_path)

    # Collect all paragraphs with heading styles
    heading_paragraphs = []
    for i, paragraph in enumerate(doc.paragraphs):
        # NOTE(review): a style's name is assumed to be possibly None in
        # python-docx; treat that as "not a heading" instead of raising.
        style_name = paragraph.style.name or ''
        if not _is_heading_style(style_name):
            continue

        text = paragraph.text
        heading_paragraphs.append({
            'index': i,
            'text': text,
            'style': style_name,
            'style_level': _style_level(style_name),
            'indent': len(text) - len(text.lstrip()),  # Simple indent detection
        })

    print("Document structure analysis:")
    print(f"Total heading paragraphs found: {len(heading_paragraphs)}")

    # Print all heading paragraphs
    for i, heading in enumerate(heading_paragraphs):
        print(f"{i+1:2d}. Style: '{heading['style']}', Level: {heading['style_level']}, Indent: {heading['indent']}")
        print(f" Text: {heading['text'][:100]}")

    # Determine actual levels based on document structure
    print("\nAnalyzing document structure to determine actual heading levels:")

    # The style level alone is not reliable: a document may use one style
    # (e.g. "标题 11") for headings that are clearly at different levels, so
    # the level is inferred from the numbering pattern of the heading text.
    for i, heading in enumerate(heading_paragraphs):
        text = heading['text'].strip()
        actual_level = _infer_level_from_text(text)
        heading['actual_level'] = actual_level

        # str() keeps the column left-aligned for both ints and None.
        print(f"{i+1:2d}. Text: {text[:30]:30s} | Style level: {str(heading['style_level']):8} | Actual level: {actual_level}")

    return heading_paragraphs
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: analyze the DOCX file named on the command line.
    parser = argparse.ArgumentParser(description="Analyze document structure in DOCX file")
    parser.add_argument("docx_file", help="Path to the DOCX file")

    args = parser.parse_args()

    # isfile() rather than exists(): a directory path is not a DOCX file.
    if not os.path.isfile(args.docx_file):
        # Raising SystemExit with a string writes it to stderr and exits with
        # status 1, so callers and shell scripts can detect the failure.
        raise SystemExit(f"File {args.docx_file} not found")

    analyze_document_structure(args.docx_file)