306 lines
12 KiB
Python
306 lines
12 KiB
Python
import docx
|
|
import os
|
|
import argparse
|
|
from docx.shared import Inches
|
|
from docx.enum.text import WD_COLOR_INDEX
|
|
from docx.oxml.shared import qn
|
|
from docx.oxml import OxmlElement
|
|
import re
|
|
|
|
def get_used_outline_levels(doc):
    """Return the sorted outline levels declared by the styles used in *doc*.

    For every paragraph in ``doc.paragraphs`` the paragraph style's ``w:pPr``
    element is searched for a ``w:outlineLvl`` child, and its ``w:val`` is
    collected.

    Args:
        doc: A python-docx ``Document`` (anything exposing ``.paragraphs``
            whose items have ``.style._element.pPr``).

    Returns:
        list[int]: Distinct outline levels in ascending order. Level 9 is
        excluded because WordprocessingML uses it to mean "body text / no
        outline level".
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    outline_tag = f"{{{w_ns}}}outlineLvl"
    val_attr = f"{{{w_ns}}}val"
    body_text_level = 9

    outline_levels = set()
    for paragraph in doc.paragraphs:
        style_element = getattr(paragraph.style, "_element", None)
        p_pr = getattr(style_element, "pPr", None)
        if p_pr is None:
            continue

        # Look the child up by qualified tag instead of attribute access:
        # an attribute the element class does not define raises
        # AttributeError, which previously got swallowed and left the
        # result empty.
        outline_el = p_pr.find(outline_tag)
        if outline_el is None:
            continue

        try:
            level = int(outline_el.get(val_attr))
        except (TypeError, ValueError):
            # Missing or non-numeric w:val: treat as "no outline level".
            continue

        if 0 <= level < body_text_level:
            outline_levels.add(level)

    return sorted(outline_levels)
|
|
|
|
def get_heading_level_from_style(style_name):
    """Extract the heading level from a style name.

    Recognises English ("Heading 1", "heading2") as well as Simplified and
    Traditional Chinese ("标题 1", "標題1") heading style names, with or
    without whitespace before the number.

    Args:
        style_name (str): Name of a paragraph style; may be empty or None.

    Returns:
        int | None: The number following the heading keyword, or None when
        the name does not look like a heading style.
    """
    if not style_name:
        return None

    # The keywords are alternatives of whole words. A character class such as
    # [标题] would match any single one of those characters, and "\s*" already
    # covers the no-space spelling, so one search is sufficient.
    level_match = re.search(r'(?:[Hh]eading|标题|標題)\s*(\d+)', style_name)
    if level_match is None:
        return None
    return int(level_match.group(1))
|
|
|
|
def map_outline_levels_to_markdown_levels(outline_levels):
    """Map document outline levels to Markdown heading levels.

    The smallest outline level present becomes ``#`` (1), the next one ``##``
    (2), and so on, so gaps in the document's levels are closed.

    Args:
        outline_levels: Iterable of outline level numbers; may be empty or
            None. Duplicates are ignored.

    Returns:
        dict[int, int]: Outline level -> Markdown heading level, capped at 6
        because Markdown has no deeper heading than ``######``.
    """
    max_markdown_level = 6

    # De-duplicate first so a repeated level cannot consume a rank.
    distinct_levels = sorted(set(outline_levels or ()))
    return {
        level: min(rank, max_markdown_level)
        for rank, level in enumerate(distinct_levels, start=1)
    }
|
|
|
|
_WORDML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_DRAWINGML_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
_RELATIONSHIPS_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
_MAX_MARKDOWN_HEADING_LEVEL = 6


def _save_document_images(doc, images_dir):
    """Write each embedded image of *doc* into *images_dir*; return {rId: filename}."""
    image_mapping = {}
    image_count = 1
    for rel in doc.part.rels.values():
        # External relationships (linked pictures, hyperlinks) have no
        # target part to read, so they are skipped.
        if rel.is_external or not rel.reltype.endswith("/image"):
            continue

        # Keep the real extension so e.g. JPEG data is not stored as ".png".
        extension = os.path.splitext(rel.target_ref)[1] or ".png"
        image_filename = f"image_{image_count}{extension}"
        with open(os.path.join(images_dir, image_filename), "wb") as f:
            f.write(rel.target_part.blob)

        image_mapping[rel.rId] = image_filename
        image_count += 1
    return image_mapping


def _heading_level_for_paragraph(para, level_mapping):
    """Return the Markdown heading level (1-6) of *para*, or None for body text."""
    md_level = None
    style = para.style

    # First choice: the outline level declared on the paragraph's style.
    p_pr = getattr(getattr(style, "_element", None), "pPr", None)
    if p_pr is not None:
        outline_el = p_pr.find(f"{{{_WORDML_NS}}}outlineLvl")
        if outline_el is not None:
            try:
                outline_level = int(outline_el.get(f"{{{_WORDML_NS}}}val"))
            except (TypeError, ValueError):
                outline_level = None
            # Outline level 9 means "body text" in WordprocessingML.
            if outline_level is not None and outline_level < 9:
                md_level = level_mapping.get(outline_level)

    # Fallback: derive the level from a style name such as "Heading 2".
    if md_level is None:
        style_name = getattr(style, "name", None)
        if style_name:
            md_level = get_heading_level_from_style(style_name)

    if md_level is None:
        return None
    return max(1, min(md_level, _MAX_MARKDOWN_HEADING_LEVEL))


def _render_paragraph_runs(para):
    """Return the text of *para* with run formatting approximated in Markdown."""
    # NOTE(review): not every python-docx release may define
    # WD_COLOR_INDEX.NONE, so it is looked up defensively - confirm.
    no_highlight = getattr(WD_COLOR_INDEX, "NONE", None)

    parts = []
    for run in para.runs:
        text = run.text
        if not text:
            continue

        highlight = run.font.highlight_color
        is_highlighted = bool(highlight) and highlight != no_highlight

        # Each marker is applied at most once: stacking "**" twice (bold +
        # highlight) or "*" twice (italic + underline) changes the meaning
        # of the Markdown emphasis.
        if run.bold or is_highlighted:
            text = f"**{text}**"
        if run.italic or run.underline:
            text = f"*{text}*"
        if run.font.strike:
            text = f"~~{text}~~"

        parts.append(text)
    return "".join(parts)


def _paragraph_image_filenames(para, image_mapping):
    """Return the saved filenames of the images embedded in *para*, in order."""
    embed_attr = f"{{{_RELATIONSHIPS_NS}}}embed"
    filenames = []
    for drawing in para._element.findall(f".//{{{_WORDML_NS}}}drawing"):
        for blip in drawing.findall(f".//{{{_DRAWINGML_NS}}}blip"):
            # The embed attribute carries the relationship id of the image.
            r_id = blip.get(embed_attr)
            if r_id and r_id in image_mapping:
                filenames.append(image_mapping[r_id])
    return filenames


def convert_docx_to_md(docx_path, md_path):
    """
    Convert a DOCX file to Markdown format.

    Embedded images are written to an ``images`` directory next to the
    Markdown file and referenced from it. Paragraphs and tables are emitted
    in document order; headings are detected from the style's outline level
    or, failing that, from the style name.

    Args:
        docx_path (str): Path to the input DOCX file
        md_path (str): Path to the output MD file
    """
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = docx.Document(docx_path)

    images_dir = os.path.join(os.path.dirname(md_path), "images")
    os.makedirs(images_dir, exist_ok=True)
    image_mapping = _save_document_images(doc, images_dir)

    used_outline_levels = get_used_outline_levels(doc)
    level_mapping = map_outline_levels_to_markdown_levels(used_outline_levels)

    # Print debug information
    print(f"Used outline levels in document: {used_outline_levels}")
    print(f"Mapping to Markdown levels: {level_mapping}")

    paragraph_tag = f"{{{_WORDML_NS}}}p"
    table_tag = f"{{{_WORDML_NS}}}tbl"

    md_content = []
    table_counter = 0

    # Walk the body's direct children so paragraphs and tables keep their
    # original relative order.
    for element in doc.element.body.xpath('./*'):
        if element.tag == paragraph_tag:
            para = Paragraph(element, doc)
            heading_level = _heading_level_for_paragraph(para, level_mapping)

            if heading_level is not None:
                # An empty heading paragraph would otherwise emit a bare "# ".
                if para.text.strip():
                    md_content.append('#' * heading_level + ' ' + para.text + '\n')
            else:
                para_content = _render_paragraph_runs(para)
                if para_content.strip():
                    md_content.append(para_content + '\n')

            # The link is relative to the Markdown file, which sits next to
            # the images directory.
            for image_filename in _paragraph_image_filenames(para, image_mapping):
                md_content.append(f"![{image_filename}](images/{image_filename})\n\n")

        elif element.tag == table_tag:
            # Wrap the element directly instead of searching doc.tables for
            # a table whose serialized XML happens to be equal.
            table_counter += 1
            md_content.append(f"\n<!-- Table {table_counter} -->\n")
            md_content.append(convert_table_to_md(Table(element, doc)))

    with open(md_path, "w", encoding="utf-8") as f:
        f.write('\n'.join(md_content))
|
|
|
|
def convert_table_to_md(table):
    """
    Convert a DOCX table to Markdown format.

    The first row is used as the header row. Rows with fewer cells than the
    widest row are padded with empty cells. Inside a cell, line breaks
    become ``<br>`` and literal pipes are escaped so they cannot be read as
    column separators.

    Args:
        table: A python-docx table object (anything exposing ``.rows`` whose
            items have ``.cells`` with a ``.text`` attribute)

    Returns:
        str: Markdown formatted table followed by a newline, or an empty
        string when the table has no rows
    """
    rows_data = [
        [
            cell.text.strip().replace('|', '\\|').replace('\n', '<br>')
            for cell in row.cells
        ]
        for row in table.rows
    ]
    if not rows_data:
        return ""

    # Pad short rows so every line has the same number of columns.
    max_cells = max(len(row_data) for row_data in rows_data)
    for row_data in rows_data:
        row_data.extend([""] * (max_cells - len(row_data)))

    md_table = ["| " + " | ".join(row_data) + " |" for row_data in rows_data]

    # The separator goes right after the header (first) row.
    md_table.insert(1, "| " + " | ".join(["---"] * max_cells) + " |")

    md_table.append("")  # Add blank line after table
    return "\n".join(md_table)
|
|
|
|
def extract_images_from_docx(docx_path, images_dir):
    """
    Extract images from a DOCX file to a specified directory.

    Images are written as ``image_<n><ext>`` where ``<n>`` counts from 1 and
    ``<ext>`` is the extension of the image part inside the package
    (``.png`` when the part name has none).

    Args:
        docx_path (str): Path to the DOCX file
        images_dir (str): Directory to save images (created if missing)

    Returns:
        list: List of image filenames
    """
    doc = docx.Document(docx_path)
    os.makedirs(images_dir, exist_ok=True)

    image_filenames = []
    for rel in doc.part.rels.values():
        # Select by relationship type instead of by the word "image" in the
        # target name, and skip external targets: they have no part whose
        # bytes could be read.
        if rel.is_external or not rel.reltype.endswith("/image"):
            continue

        # Keep the real extension so e.g. JPEG data is not stored as ".png".
        extension = os.path.splitext(rel.target_ref)[1] or ".png"
        image_filename = f"image_{len(image_filenames) + 1}{extension}"

        with open(os.path.join(images_dir, image_filename), "wb") as f:
            f.write(rel.target_part.blob)

        image_filenames.append(image_filename)

    return image_filenames
|
|
|
|
def main():
    """Command-line entry point: convert one DOCX file to Markdown.

    The Markdown file is named after the DOCX file and written into the
    output directory (current directory by default). Exits with status 1
    when the input file does not exist.
    """
    import sys

    parser = argparse.ArgumentParser(description="Convert DOCX file to Markdown format")
    parser.add_argument("docx_file", help="Path to the input DOCX file")
    parser.add_argument("output_dir", nargs='?', default=".", help="Output directory (default: current directory)")
    args = parser.parse_args()

    docx_file = args.docx_file
    output_dir = args.output_dir

    # Validate the input before touching the file system, so a typo in the
    # path does not leave an empty output directory behind.
    if not os.path.exists(docx_file):
        print(f"File {docx_file} not found", file=sys.stderr)
        # sys.exit rather than exit(): the latter is injected by the site
        # module and is not guaranteed to be available.
        sys.exit(1)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate MD filename based on DOCX filename
    docx_basename = os.path.splitext(os.path.basename(docx_file))[0]
    md_file = os.path.join(output_dir, docx_basename + ".md")

    convert_docx_to_md(docx_file, md_file)
    print(f"Converted {docx_file} to {md_file}")


if __name__ == "__main__":
    main()