"""Inspect the heading paragraphs of a DOCX file and infer their outline levels."""

import argparse
import os
import re

import docx

# Substrings that mark a paragraph style as a heading style in Chinese
# templates (Simplified and Traditional spellings).
_CJK_HEADING_MARKERS = ("标题", "標題")

# "Heading 2", "heading2", "标题 3", "標題 1" -> captures the numeric suffix.
_STYLE_LEVEL_RE = re.compile(r"(?:[Hh]eading|标题|標題)\s*(\d+)")

# Text patterns commonly used for numbered headings in Chinese documents.
_CHAPTER_RE = re.compile(r"第[一二三四五六七八九十\d]+[章篇节]")
_SECTION_RE = re.compile(r"[一二三四五六七八九十\d]+[、.]")
# Accept both ASCII and full-width parentheses: "(1)" and "（一）".
_SUBSECTION_RE = re.compile(r"[(（][一二三四五六七八九十\d]+[)）]")


def _is_heading_style(style_name):
    """Return True if *style_name* looks like a heading style name."""
    return style_name.startswith("Heading") or any(
        marker in style_name for marker in _CJK_HEADING_MARKERS
    )


def _extract_style_level(style_name):
    """Return the numeric level embedded in *style_name*, or None if absent."""
    match = _STYLE_LEVEL_RE.search(style_name)
    return int(match.group(1)) if match else None


def _infer_actual_level(text):
    """Infer a heading level (1-3) from the numbering prefix of *text*.

    Chapter-like prefixes map to 1, "N、"/"N." prefixes to 2, parenthesised
    numbers to 3. Text without a recognised prefix defaults to level 1.
    """
    if _CHAPTER_RE.match(text):
        return 1
    if _SECTION_RE.match(text):
        return 2
    if _SUBSECTION_RE.match(text):
        return 3
    return 1


def analyze_document_structure(docx_path):
    """Print a report of the heading paragraphs found in a DOCX file.

    Every paragraph whose style name starts with "Heading" or contains
    "标题"/"標題" is treated as a heading. The report lists, for each heading,
    the level encoded in its style name and a level inferred from the
    numbering pattern at the start of its text.

    Args:
        docx_path: Path passed to ``docx.Document``.

    Returns:
        A list of dicts, one per heading paragraph, with the keys ``index``
        (position in ``doc.paragraphs``), ``text``, ``style``,
        ``style_level`` (int or None), ``indent`` (count of leading
        whitespace characters in the text) and ``actual_level``.
    """
    doc = docx.Document(docx_path)

    heading_paragraphs = []
    for index, paragraph in enumerate(doc.paragraphs):
        # NOTE(review): a style without a name element appears to yield None
        # here; fall back to "" so such paragraphs are skipped, not a crash.
        style_name = paragraph.style.name or ""
        if not _is_heading_style(style_name):
            continue

        text = paragraph.text
        heading_paragraphs.append({
            'index': index,
            'text': text,
            'style': style_name,
            'style_level': _extract_style_level(style_name),
            # Simple indent detection: leading whitespace in the raw text.
            'indent': len(text) - len(text.lstrip()),
        })

    print("Document structure analysis:")
    print(f"Total heading paragraphs found: {len(heading_paragraphs)}")

    for number, heading in enumerate(heading_paragraphs, start=1):
        print(
            f"{number:2d}. Style: '{heading['style']}', "
            f"Level: {heading['style_level']}, Indent: {heading['indent']}"
        )
        print(f" Text: {heading['text'][:100]}")

    print("\nAnalyzing document structure to determine actual heading levels:")

    # Style names alone are not reliable (a document may use one style such
    # as "标题 11" for headings at every depth), so the level is derived from
    # the numbering pattern at the start of the heading text instead.
    for number, heading in enumerate(heading_paragraphs, start=1):
        text = heading['text'].strip()
        actual_level = _infer_actual_level(text)
        heading['actual_level'] = actual_level
        # str() keeps the column left-aligned for both ints and None, and
        # avoids reporting a level of 0 as "None".
        print(
            f"{number:2d}. Text: {text[:30]:30s} | "
            f"Style level: {str(heading['style_level']):8} | "
            f"Actual level: {actual_level}"
        )

    return heading_paragraphs


def main():
    """Parse the command line and analyze the requested DOCX file."""
    parser = argparse.ArgumentParser(description="Analyze document structure in DOCX file")
    parser.add_argument("docx_file", help="Path to the DOCX file")
    args = parser.parse_args()

    # isfile (not exists) so a directory path is rejected here instead of
    # failing inside docx.Document; exit non-zero so callers can detect it.
    if not os.path.isfile(args.docx_file):
        parser.exit(1, f"File {args.docx_file} not found\n")

    analyze_document_structure(args.docx_file)


if __name__ == "__main__":
    main()