import docx import os import argparse from docx.shared import Inches from docx.enum.text import WD_COLOR_INDEX from docx.oxml.shared import qn from docx.oxml import OxmlElement import re def get_used_outline_levels(doc): """Get all outline levels that are actually used in the document paragraphs""" outline_levels = set() for paragraph in doc.paragraphs: try: # Check if paragraph has outline level defined if paragraph.style._element.pPr is not None and paragraph.style._element.pPr.outlineLvl is not None: level = paragraph.style._element.pPr.outlineLvl.val outline_levels.add(level) except AttributeError: pass return sorted(outline_levels) def get_heading_level_from_style(style_name): """Extract heading level from style name, supporting both English and Chinese styles""" # Check for patterns like "Heading 1", "标题 1", "标题1", etc. level_match = re.search(r'[Hh]eading\s*(\d+)|[标题標題]\s*(\d+)|[标题标题]\s*(\d+)', style_name) if level_match: return int(level_match.group(1) or level_match.group(2) or level_match.group(3)) # Check for patterns like "Heading1", "标题1", etc. (no space) level_match = re.search(r'[Hh]eading(\d+)|[标题標題](\d+)|[标题标题](\d+)', style_name) if level_match: return int(level_match.group(1) or level_match.group(2) or level_match.group(3)) return None def map_outline_levels_to_markdown_levels(outline_levels): """Map document outline levels to markdown heading levels (highest = #, next = ##, etc.)""" if not outline_levels: return {} # Map outline levels to markdown levels (lowest outline level value = highest heading level) # In Word, outline level 0 is the highest, level 1 is next, etc. level_mapping = {} for i, level in enumerate(sorted(outline_levels)): level_mapping[level] = i + 1 return level_mapping def convert_docx_to_md(docx_path, md_path): """ Convert a DOCX file to Markdown format. Args: docx_path (str): Path to the input DOCX file md_path (str): Path to the output MD file """ # Load the document doc = docx.Document(docx_path) # Create directory for images if it doesn't exist md_dir = os.path.dirname(md_path) images_dir = os.path.join(md_dir, "images") if not os.path.exists(images_dir): os.makedirs(images_dir) md_content = [] image_count = 1 # Extract all images first and create a mapping image_mapping = {} for rel in doc.part.rels.values(): if "image" in rel.target_ref: image = rel.target_part.blob image_filename = f"image_{image_count}.png" image_path = os.path.join(images_dir, image_filename) with open(image_path, "wb") as f: f.write(image) # Store the relationship ID and image filename image_mapping[rel.rId] = image_filename image_count += 1 # Get outline levels that are actually used in the document and create mapping to markdown levels used_outline_levels = get_used_outline_levels(doc) level_mapping = map_outline_levels_to_markdown_levels(used_outline_levels) # Print debug information print(f"Used outline levels in document: {used_outline_levels}") print(f"Mapping to Markdown levels: {level_mapping}") # Create a more sophisticated approach to handle document structure # We'll iterate through the document's XML elements to preserve order # Get all body elements in order body_elements = doc.element.body.xpath('./*') # Keep track of which tables we've processed processed_tables = set() # Process each element in order table_counter = 0 for element in body_elements: # Check if it's a paragraph if element.tag.endswith('p'): # Convert to paragraph object para = docx.text.paragraph.Paragraph(element, doc) # Handle headings based on outline level or style name md_heading_level = None # First, try to get outline level from the paragraph's style try: if para.style._element.pPr is not None and para.style._element.pPr.outlineLvl is not None: outline_level = para.style._element.pPr.outlineLvl.val # Map to markdown level if outline_level in level_mapping: md_heading_level = level_mapping[outline_level] except AttributeError: pass # If we can't get outline level, try to extract from style name if md_heading_level is None: style_level = get_heading_level_from_style(para.style.name) if style_level is not None: # For style-based levels, we'll map them directly but cap at reasonable levels md_heading_level = min(style_level, 6) # Markdown supports up to 6 levels if md_heading_level is not None: # Convert to Markdown heading md_content.append('#' * md_heading_level + ' ' + para.text + '\n') else: # Process runs for formatting para_content = "" for run in para.runs: text = run.text # Skip empty text if not text: continue # Handle bold if run.bold: text = f"**{text}**" # Handle italic if run.italic: text = f"*{text}*" # Handle underline (not standard in MD, using emphasis) if run.underline: text = f"*{text}*" # Handle strikethrough if run.font.strike: text = f"~~{text}~~" # Handle highlight (convert to bold as approximation) if run.font.highlight_color and run.font.highlight_color != WD_COLOR_INDEX.NONE: text = f"**{text}**" para_content += text # Check for inline images in this paragraph inline_images = [] # Look for drawing elements in the paragraph drawing_elements = para._element.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing') for drawing in drawing_elements: # Find the blip (image) element blip_elements = drawing.findall('.//{http://schemas.openxmlformats.org/drawingml/2006/main}blip') for blip in blip_elements: # Get the embed attribute which references the image relationship rId = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') if rId and rId in image_mapping: inline_images.append(image_mapping[rId]) # Add paragraph content if para_content.strip() or inline_images: # Add the paragraph text if para_content.strip(): md_content.append(para_content + '\n') # Add inline images that belong to this paragraph for image_filename in inline_images: md_content.append(f"\n![Image](images/{image_filename})\n") # Check if it's a table elif element.tag.endswith('tbl'): # Find the corresponding table object for i, table in enumerate(doc.tables): if i not in processed_tables and table._element.xml == element.xml: table_counter += 1 md_table = convert_table_to_md(table) md_content.append(f"\n\n") md_content.append(md_table) processed_tables.add(i) break # Write to file with open(md_path, "w", encoding="utf-8") as f: f.write('\n'.join(md_content)) def convert_table_to_md(table): """ Convert a DOCX table to Markdown format. Args: table: A python-docx table object Returns: str: Markdown formatted table """ md_table = [] # Process all rows to find max cells per row rows_data = [] max_cells = 0 for row in table.rows: row_data = [] for cell in row.cells: # Clean up cell text cell_text = cell.text.strip().replace('\n', '
') row_data.append(cell_text) rows_data.append(row_data) max_cells = max(max_cells, len(row_data)) # Ensure all rows have the same number of cells for row_data in rows_data: while len(row_data) < max_cells: row_data.append("") # Process header row if rows_data: header = "| " + " | ".join(rows_data[0]) + " |" md_table.append(header) # Add separator row separator = "| " + " | ".join(["---" for _ in range(max_cells)]) + " |" md_table.append(separator) # Process data rows for row_data in rows_data[1:]: row_str = "| " + " | ".join(row_data) + " |" md_table.append(row_str) md_table.append("") # Add blank line after table return "\n".join(md_table) def extract_images_from_docx(docx_path, images_dir): """ Extract images from a DOCX file to a specified directory. Args: docx_path (str): Path to the DOCX file images_dir (str): Directory to save images Returns: list: List of image filenames """ doc = docx.Document(docx_path) image_filenames = [] if not os.path.exists(images_dir): os.makedirs(images_dir) image_count = 1 for rel in doc.part.rels.values(): if "image" in rel.target_ref: image = rel.target_part.blob image_filename = f"image_{image_count}.png" image_path = os.path.join(images_dir, image_filename) with open(image_path, "wb") as f: f.write(image) image_filenames.append(image_filename) image_count += 1 return image_filenames if __name__ == "__main__": parser = argparse.ArgumentParser(description="Convert DOCX file to Markdown format") parser.add_argument("docx_file", help="Path to the input DOCX file") parser.add_argument("output_dir", nargs='?', default=".", help="Output directory (default: current directory)") args = parser.parse_args() docx_file = args.docx_file output_dir = args.output_dir # Create output directory if it doesn't exist if not os.path.exists(output_dir): os.makedirs(output_dir) # Generate MD filename based on DOCX filename docx_basename = os.path.splitext(os.path.basename(docx_file))[0] md_file = os.path.join(output_dir, docx_basename + ".md") if os.path.exists(docx_file): convert_docx_to_md(docx_file, md_file) print(f"Converted {docx_file} to {md_file}") else: print(f"File {docx_file} not found") exit(1)