note/知识图谱/教科书-数学/knowledge/fix_json_errors.py
2025-11-19 10:16:05 +08:00

183 lines
6.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import json
import re
# LaTeX command names recognised by the fixer.  They only matter when the
# name starts with a JSON escape letter (b, f, n, r, t): "\frac" is *valid*
# JSON but decodes to a form feed followed by "rac", so such commands need
# their backslash doubled even though the parser would accept them.
_LATEX_COMMANDS = ('frac', 'sqrt', 'sin', 'cos', 'tan', 'log', 'ln', 'alpha', 'beta', 'gamma', 'delta', 'theta', 'lambda', 'mu', 'pi', 'sigma', 'phi', 'omega')
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def _starts_latex_command(text, pos):
    """Return True if a known LaTeX command name starts at ``text[pos]`` as a whole word."""
    for cmd in _LATEX_COMMANDS:
        if text.startswith(cmd, pos):
            follower = text[pos + len(cmd):pos + len(cmd) + 1]
            # Require a word boundary so that e.g. a tab escape followed by
            # "and" ("\tand") is not mistaken for the command "\tan".
            if not ('a' <= follower <= 'z' or 'A' <= follower <= 'Z'):
                return True
    return False


def _escape_invalid_backslashes(text):
    """Double every backslash inside a JSON string that is not a valid escape.

    Valid escape sequences (``\\"``, ``\\\\``, ``\\/``, ``\\b``, ``\\f``,
    ``\\n``, ``\\r``, ``\\t`` and ``\\uXXXX``) are consumed as a unit and
    copied unchanged, so an escaped quote never toggles the string state and
    already-doubled backslashes are not doubled again.  Backslashes outside
    of strings are copied unchanged.
    """
    pieces = []
    in_string = False
    i = 0
    size = len(text)
    while i < size:
        ch = text[i]
        if ch == '"':
            # Escaped quotes are consumed together with their backslash
            # below, so any quote seen here really delimits a string.
            in_string = not in_string
            pieces.append(ch)
            i += 1
        elif ch != '\\' or not in_string:
            pieces.append(ch)
            i += 1
        else:
            nxt = text[i + 1:i + 2]  # '' when the backslash is the last char
            hex_part = text[i + 2:i + 6]
            if nxt and nxt in '"\\/':
                pieces.append(ch + nxt)
                i += 2
            elif (nxt == 'u' and len(hex_part) == 4
                    and all(c in _HEX_DIGITS for c in hex_part)):
                pieces.append(text[i:i + 6])
                i += 6
            elif (nxt and nxt in 'bfnrt'
                    and not _starts_latex_command(text, i + 1)):
                pieces.append(ch + nxt)
                i += 2
            else:
                # Lone backslash (typically a LaTeX command such as \alpha):
                # escape it so the JSON parser yields a literal backslash.
                pieces.append('\\\\')
                i += 1
    return ''.join(pieces)


def fix_json_escape_errors(file_path):
    """Repair invalid backslash escapes in a JSON file, in place.

    The file is read as UTF-8, every backslash inside a JSON string that does
    not begin a valid JSON escape sequence is doubled (so LaTeX such as
    ``\\alpha`` survives parsing), and the result is validated with
    ``json.loads``.  The file is only rewritten when the repaired text parses.

    Args:
        file_path: Path of the JSON file to repair.

    Returns:
        True if the repaired content parsed and was written back.  If it
        still does not parse, the result of ``fix_json_simple(file_path)``.
        False if any other error (e.g. the file cannot be read) occurred.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        fixed_content = _escape_invalid_backslashes(content)

        try:
            json.loads(fixed_content)
        except json.JSONDecodeError as e:
            print(f"{file_path} - 修复失败: {str(e)}")
            # Automatic repair was not enough; fall back to the simpler method.
            return fix_json_simple(file_path)

        print(f"{file_path} - 修复成功")
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(fixed_content)
        return True
    except Exception as e:
        # Best-effort batch tool: report the problem and keep going.
        print(f"{file_path} - 处理出错: {str(e)}")
        return False
# Matches either a complete, valid JSON escape sequence (its tail is captured
# in group 1) or a lone backslash that does not start one.  Consuming valid
# escapes as a unit keeps an already-doubled backslash ("\\") from being
# doubled again.
_BACKSLASH_TOKEN_RE = re.compile(r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})|\\')


def fix_json_simple(file_path):
    """Simple fallback repair: double every backslash that is not a valid JSON escape.

    The file is read as UTF-8 and scanned once with a regular expression.
    Valid JSON escape sequences are kept as they are; any other backslash
    (for example the one in ``\\alpha``) is replaced by two backslashes.
    This is a heuristic: it does not track whether a backslash is inside a
    JSON string.  The file is only rewritten when the result parses.

    Args:
        file_path: Path of the JSON file to repair.

    Returns:
        True if the repaired content parsed with ``json.loads`` and was
        written back, False otherwise (the file is then left untouched).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        fixed_content = _BACKSLASH_TOKEN_RE.sub(
            # group(1) is set only for valid escapes, which are kept verbatim.
            lambda m: m.group(0) if m.group(1) else '\\\\',
            content,
        )

        # Validate before touching the file on disk.
        json.loads(fixed_content)
        print(f"{file_path} - 简单修复成功")
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(fixed_content)
        return True
    except Exception as e:
        # Best-effort batch tool: report the problem and keep going.
        print(f"{file_path} - 简单修复也失败: {str(e)}")
        return False
if __name__ == "__main__":
    # Knowledge-graph JSON files to repair; the names are relative, so they
    # are resolved against the current working directory.
    targets = (
        'knowledge-选择性必修第一章-空间向量与立体几何.json',
        'knowledge-选择性必修第三章-圆锥曲线的方程.json',
        'knowledge-选择性必修第二章-直线和圆的方程.json',
        'knowledge-选择性必修第五章-一元函数的导数及其应用.json',
        'knowledge-选择性必修第八章-成对数据的统计分析.json',
        'knowledge-选择性必修第六章-计数原理.json',
        'knowledge-选择性必修第四章-数列.json',
    )

    # Each file is processed independently; the return value is ignored.
    for target in targets:
        fix_json_escape_errors(target)