183 lines
6.7 KiB
Python
183 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
||
import json
|
||
import re
|
||
|
||
# Command names that mark a backslash as the start of a LaTeX command.
# Matching is by prefix, so e.g. "\sinh" is recognised through "sin".
_LATEX_COMMAND_NAMES = (
    'frac', 'sqrt', 'sin', 'cos', 'tan', 'log', 'ln', 'alpha', 'beta',
    'gamma', 'delta', 'theta', 'lambda', 'mu', 'pi', 'sigma', 'phi', 'omega',
)

# Characters that form a two-character JSON escape after a backslash.
_JSON_SIMPLE_ESCAPE_CHARS = '"\\/bfnrt'

# Tail of a JSON unicode escape: "u" followed by exactly four hex digits.
_JSON_UNICODE_ESCAPE_TAIL = re.compile(r'u[0-9a-fA-F]{4}')


def _escape_stray_backslashes(text):
    """Return *text* with stray backslashes inside JSON strings doubled.

    The text is scanned once while tracking whether the cursor is inside a
    double-quoted JSON string.  Inside a string, a backslash is handled as
    follows (first matching rule wins):

    1. Followed by one of the known LaTeX command names: it is doubled, so
       the decoded value contains a literal backslash.  This is checked
       before the JSON escapes because "\\frac", "\\theta", "\\beta" and
       "\\tan" would otherwise be read as form feed / tab / backspace
       escapes.
    2. Followed by a valid JSON escape character, or by ``uXXXX``: the whole
       escape sequence is copied unchanged and skipped, so an escaped quote
       never toggles the string state and an escaped backslash is never
       re-examined.
    3. Anything else (including a backslash at the very end of the text):
       it is doubled.

    Backslashes outside strings are copied unchanged.
    """
    pieces = []
    in_string = False
    pos = 0
    length = len(text)

    while pos < length:
        char = text[pos]

        if char == '"':
            # Only unescaped quotes reach this branch; escaped ones are
            # consumed together with their backslash below.
            in_string = not in_string
        elif char == '\\' and in_string:
            following = pos + 1

            if text.startswith(_LATEX_COMMAND_NAMES, following):
                pieces.append('\\\\')
                pos = following
                continue

            if following < length and text[following] in _JSON_SIMPLE_ESCAPE_CHARS:
                pieces.append(text[pos:pos + 2])
                pos += 2
                continue

            if _JSON_UNICODE_ESCAPE_TAIL.match(text, following):
                pieces.append(text[pos:pos + 6])
                pos += 6
                continue

            # Not a valid JSON escape: make it a literal backslash.
            pieces.append('\\\\')
            pos = following
            continue

        pieces.append(char)
        pos += 1

    return ''.join(pieces)


def fix_json_escape_errors(file_path):
    """Repair backslash escape errors in a JSON file, in place.

    The file is read as UTF-8, stray backslashes inside string values are
    doubled (see ``_escape_stray_backslashes``), and the result is validated
    with ``json.loads``.  The file is only overwritten when the repaired
    text parses.

    Args:
        file_path: Path of the JSON file to repair.

    Returns:
        True when the repaired text parsed and was written back.  When the
        repaired text still fails to parse, the result of
        ``fix_json_simple(file_path)``.  False when any other error occurred
        (for example the file could not be read); the error is printed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        fixed_content = _escape_stray_backslashes(content)

        # Validate before touching the file so a failed repair never
        # overwrites the original content.
        try:
            json.loads(fixed_content)
        except json.JSONDecodeError as e:
            print(f"✗ {file_path} - 修复失败: {e}")
            # Fall back to the simpler math-span based repair.
            return fix_json_simple(file_path)

        print(f"✓ {file_path} - 修复成功")
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(fixed_content)
        return True

    except Exception as e:
        # Best-effort batch tool: report the problem and signal failure
        # instead of aborting the whole run.
        print(f"✗ {file_path} - 处理出错: {e}")
        return False
|
||
|
||
# A math span is either "$...$" or "\(...\)".  The "\(" form is anchored at
# the start of its backslash run (negative look-behind) so every backslash
# run touched later lies completely inside the span.  "." does not match a
# newline, so a span never extends past the end of a line.
_SIMPLE_MATH_SPAN_RE = re.compile(r'\$[^$]*\$|(?<!\\)\\+\(.*?\\+\)')

# A maximal run of consecutive backslashes.
_SIMPLE_BACKSLASH_RUN_RE = re.compile(r'\\+')

# Tail of a JSON unicode escape: "u" followed by exactly four hex digits.
_SIMPLE_UNICODE_TAIL_RE = re.compile(r'u[0-9a-fA-F]{4}')


def _double_stray_backslash(run_match):
    """Return the matched backslash run, extended by one if it is stray.

    An even-length run already consists of escaped backslashes and is kept.
    An odd-length run ends in a backslash that escapes the next character;
    it is kept when that forms ``\\"`` or ``\\uXXXX`` and otherwise gets one
    more backslash so the run decodes to literal backslashes.
    """
    run = run_match.group()
    if len(run) % 2 == 0:
        return run

    text = run_match.string
    end = run_match.end()
    if text.startswith('"', end) or _SIMPLE_UNICODE_TAIL_RE.match(text, end):
        return run
    return run + '\\'


def _escape_backslashes_in_math(content):
    """Double stray backslashes inside ``$...$`` and ``\\(...\\)`` spans.

    Text outside math spans is returned unchanged.  This is a heuristic: it
    does not parse JSON, it only looks for math delimiters.
    """
    return _SIMPLE_MATH_SPAN_RE.sub(
        lambda span: _SIMPLE_BACKSLASH_RUN_RE.sub(
            _double_stray_backslash, span.group()
        ),
        content,
    )


def fix_json_simple(file_path):
    """Simpler, math-span based repair of backslash escapes in a JSON file.

    The file is read as UTF-8, stray backslashes inside ``$...$`` and
    ``\\(...\\)`` spans are doubled, and the result is validated with
    ``json.loads``.  The file is only overwritten when the repaired text
    parses.

    Args:
        file_path: Path of the JSON file to repair.

    Returns:
        True when the repaired text parsed and was written back, False when
        reading, parsing or writing failed (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        fixed_content = _escape_backslashes_in_math(content)

        # Raises on invalid JSON, which skips the write below.
        json.loads(fixed_content)
        print(f"✓ {file_path} - 简单修复成功")

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(fixed_content)
        return True

    except Exception as e:
        # Best-effort fallback: report and signal failure to the caller.
        print(f"✗ {file_path} - 简单修复也失败: {e}")
        return False
|
||
|
||
if __name__ == "__main__":
    # JSON files to repair; each one is handed to fix_json_escape_errors
    # in the order listed here.
    targets = (
        'knowledge-选择性必修第一章-空间向量与立体几何.json',
        'knowledge-选择性必修第三章-圆锥曲线的方程.json',
        'knowledge-选择性必修第二章-直线和圆的方程.json',
        'knowledge-选择性必修第五章-一元函数的导数及其应用.json',
        'knowledge-选择性必修第八章-成对数据的统计分析.json',
        'knowledge-选择性必修第六章-计数原理.json',
        'knowledge-选择性必修第四章-数列.json',
    )

    for json_path in targets:
        fix_json_escape_errors(json_path)