note/知识图谱/教科书-数学/all_副本/generate_sql_inserts.py
2025-11-19 10:16:05 +08:00

318 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import re
# --- Configuration ---
INPUT_DIR = "/Users/robertmaxwell/note/知识图谱/教科书-数学/all_副本/"
OUTPUT_SQL_FILE = os.path.join(INPUT_DIR, "knowledge_graph_inserts.sql")
# --- Helper Functions ---
def escape_sql_string(s):
"""Escapes single quotes in a string for SQL."""
if s is None:
return 'NULL'
# Replace single quotes with two single quotes for SQL escaping
# Also, handle potential triple single quotes if they somehow appear in data and conflict with f'''...'''
return f"'{str(s).replace("'", "''").replace("'''", "''''''")}'"
def to_sql_jsonb(data):
"""Converts a Python object to a SQL JSONB string."""
if data is None: # Handle None explicitly
return 'NULL'
# Ensure only JSON-serializable types are passed
try:
# json.dumps handles internal quotes, so we just need to escape the outer single quotes if any
json_str = json.dumps(data, ensure_ascii=False)
return escape_sql_string(json_str)
except TypeError:
print(f"Warning: Non-JSON serializable data found: {data}. Returning NULL.")
return 'NULL'
def to_sql_text_array(arr):
"""Converts a Python list of strings to a SQL TEXT[] array string."""
if not arr: # Handle empty list explicitly
return 'NULL'
# Filter out None values from the array if any, and ensure all are strings
clean_arr = [str(item) for item in arr if item is not None]
if not clean_arr:
return 'NULL'
escaped_elements = [escape_sql_string(item) for item in clean_arr]
return f"ARRAY[{', '.join(escaped_elements)}]"
def parse_id_for_chapter(item_id):
"""Parses the item ID to extract chapter information (if needed, though not used in current schema)."""
match = re.match(r'^[KMT](\d+)-(\d+)(?:-(\d+))?', item_id)
if match:
chapter_num = match.group(1)
section_num = match.group(2)
subsection_num = match.group(3) if match.group(3) else '0'
return f"Chapter {chapter_num}.{section_num}.{subsection_num}"
return "Unknown Chapter"
# --- Main Script Logic ---
def generate_sql_inserts():
sql_statements = []
# Add schema and table creation DDL
sql_statements.append('''
-- 首先创建一个Schema来组织知识图谱相关的表这是一种好的实践
CREATE SCHEMA IF NOT EXISTS kg;
-- 1. 知识点表 (Knowledge Table)
CREATE TABLE IF NOT EXISTS kg.knowledge (
id VARCHAR(50) PRIMARY KEY, -- 对应 "编号"
level VARCHAR(50), -- 对应 "层次"
name TEXT NOT NULL, -- 对应 "名称"
type VARCHAR(50), -- 对应 "类型"
core_content JSONB, -- 【JSONB】用于存储复杂的JSON对象
principle JSONB, -- 【JSONB】
conditions JSONB, -- 【JSONB】
prerequisites VARCHAR(50)[], -- 【TEXT[]】用于存储编号数组
related_content JSONB, -- 【JSONB】
importance VARCHAR(50), -- 对应 "重要程度"
exam_ways TEXT[] -- 【TEXT[]】用于存储字符串数组
);
-- 2. 方法表 (Method Table)
CREATE TABLE IF NOT EXISTS kg.methods (
id VARCHAR(50) PRIMARY KEY, -- 对应 "编号"
name TEXT NOT NULL, -- 对应 "名称"
type VARCHAR(50), -- 对应 "类型"
scenarios JSONB, -- 【JSONB】适用场景
steps JSONB, -- 【JSONB】方法步骤
math_ideas TEXT[], -- 【TEXT[]】数学思想
strategy TEXT, -- 解题策略
prerequisite_methods VARCHAR(50)[], -- 【TEXT[]】前置方法
common_errors JSONB, -- 【JSONB】常见错误
difficulty SMALLINT, -- 难度等级
location TEXT -- 教材位置
);
-- 3. 题目表 (Problem Table)
CREATE TABLE IF NOT EXISTS kg.problems (
id VARCHAR(50) PRIMARY KEY, -- 对应 "编号"
type VARCHAR(50), -- 题目类型
source_info JSONB, -- 【JSONB】来源信息
content JSONB, -- 【JSONB】题目内容
question_types JSONB, -- 【JSONB】题型分类
difficulty JSONB -- 【JSONB】难度评估
);
-- 4. 关联表 (Link Tables) - 用于处理多对多关系
-- 方法支撑的知识点
CREATE TABLE IF NOT EXISTS kg.method_knowledge_link (
method_id VARCHAR(50) REFERENCES kg.methods(id),
knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id),
PRIMARY KEY (method_id, knowledge_id)
);
-- 题目考查的知识点
CREATE TABLE IF NOT EXISTS kg.problem_knowledge_link (
problem_id VARCHAR(50) REFERENCES kg.problems(id),
knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id),
-- 可以增加一个字段表示是“主要考查”还是“辅助涉及”
relevance VARCHAR(20) DEFAULT '主要考查',
PRIMARY KEY (problem_id, knowledge_id, relevance)
);
-- 题目使用的方法
CREATE TABLE IF NOT EXISTS kg.problem_method_link (
problem_id VARCHAR(50) REFERENCES kg.problems(id),
method_id VARCHAR(50) REFERENCES kg.methods(id),
PRIMARY KEY (problem_id, method_id)
);
''')
sql_statements.append("\n-- --- INSERT DATA ---\n")
# Process Knowledge Points
knowledge_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("knowledge-") and f.endswith(".json")]
for filename in knowledge_files:
filepath = os.path.join(INPUT_DIR, filename)
print(f"Processing knowledge file: {filename}") # Debugging line
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
for item in data.get('knowledge_list', []):
item_id = item.get('编号')
if not item_id:
print(f"Warning: Skipping knowledge item in {filename} due to missing '编号'.")
continue
sql = f'''
INSERT INTO kg.knowledge (id, level, name, type, core_content, principle, conditions, prerequisites, related_content, importance, exam_ways)
VALUES (
{escape_sql_string(item_id)},
{escape_sql_string(item.get('层次'))},
{escape_sql_string(item.get('名称'))},
{escape_sql_string(item.get('类型'))},
{to_sql_jsonb(item.get('核心内容'))},
{to_sql_jsonb(item.get('原理说明'))},
{to_sql_jsonb(item.get('适用条件'))},
{to_sql_text_array(item.get('前置知识'))},
{to_sql_jsonb(item.get('关联内容'))},
{escape_sql_string(item.get('重要程度'))},
{to_sql_text_array(item.get('考查方式'))}
);
'''
sql_statements.append(sql)
except json.JSONDecodeError as e:
print(f"Error decoding JSON in {filename}: {e}")
continue
except Exception as e:
print(f"An unexpected error occurred while processing {filename}: {e}")
continue
# Process Methods
method_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("method-") and f.endswith(".json")]
for filename in method_files:
filepath = os.path.join(INPUT_DIR, filename)
print(f"Processing method file: {filename}") # Debugging line
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
for item in data.get('method_list', []):
item_id = item.get('编号')
if not item_id:
print(f"Warning: Skipping method item in {filename} due to missing '编号'.")
continue
sql = f'''
INSERT INTO kg.methods (id, name, type, scenarios, steps, math_ideas, strategy, prerequisite_methods, common_errors, difficulty, location)
VALUES (
{escape_sql_string(item_id)},
{escape_sql_string(item.get('名称'))},
{escape_sql_string(item.get('类型'))},
{to_sql_jsonb(item.get('适用场景'))},
{to_sql_jsonb(item.get('方法步骤'))},
{to_sql_text_array(item.get('数学思想'))},
{escape_sql_string(item.get('解题策略'))},
{to_sql_text_array(item.get('前置方法'))},
{to_sql_jsonb(item.get('常见错误'))},
{item.get('难度等级', 'NULL')},
{escape_sql_string(item.get('教材位置'))}
);
'''
sql_statements.append(sql)
except json.JSONDecodeError as e:
print(f"Error decoding JSON in {filename}: {e}")
continue
except Exception as e:
print(f"An unexpected error occurred while processing {filename}: {e}")
continue
# Process Problems and Link Tables
problem_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("problems-") and f.endswith(".json")]
for filename in problem_files:
filepath = os.path.join(INPUT_DIR, filename)
print(f"Processing problem file: {filename}") # Debugging line
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
for item in data.get('problem_list', []):
problem_id = item.get('题目基本信息', {}).get('编号')
if not problem_id:
print(f"Warning: Skipping problem item in {filename} due to missing '编号'.")
continue
# Insert into kg.problems
sql = f'''
INSERT INTO kg.problems (id, type, source_info, content, question_types, difficulty)
VALUES (
{escape_sql_string(problem_id)},
{escape_sql_string(item.get('题目基本信息', {}).get('题目类型'))},
{to_sql_jsonb(item.get('题目基本信息', {}).get('来源信息'))},
{to_sql_jsonb(item.get('题目内容'))},
{to_sql_jsonb(item.get('题型分类'))},
{to_sql_jsonb(item.get('难度评估'))}
);
'''
sql_statements.append(sql)
# Insert into kg.problem_knowledge_link
knowledge_annotations = item.get('知识点标注', {})
for sub_question_key, annotations in knowledge_annotations.items():
for main_kp in annotations.get('主要考查', []):
kp_id = main_kp.get('知识点编号')
if kp_id:
sql = f'''
INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance)
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '主要考查');
'''
sql_statements.append(sql)
for aux_kp in annotations.get('辅助涉及', []):
kp_id = aux_kp.get('知识点编号')
if kp_id:
sql = f'''
INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance)
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '辅助涉及');
'''
sql_statements.append(sql)
# Insert into kg.problem_method_link
method_annotations = item.get('方法标注', {})
for sub_question_key, methods_list in method_annotations.items():
for method in methods_list:
method_id = method.get('方法编号')
if method_id:
sql = f'''
INSERT INTO kg.problem_method_link (problem_id, method_id)
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(method_id)});'''
sql_statements.append(sql)
except json.JSONDecodeError as e:
print(f"Error decoding JSON in {filename}: {e}")
continue
except Exception as e:
print(f"An unexpected error occurred while processing {filename}: {e}")
continue
# Insert into kg.method_knowledge_link (from method files)
# This needs to be done after all knowledge points are inserted
# Re-process method files for links
for filename in method_files:
filepath = os.path.join(INPUT_DIR, filename)
print(f"Processing method file for links: {filename}") # Debugging line
try:
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
for item in data.get('method_list', []):
method_id = item.get('编号')
if not method_id:
continue
# "支撑知识点" in method JSON
supporting_kps = item.get('支撑知识点', [])
for kp_full_name in supporting_kps:
# Attempt to extract ID using regex for "ID NAME" format
match = re.match(r'^([K]\d+-\d+-\d+-\d+)', kp_full_name)
if match:
kp_id = match.group(1)
else:
# If regex doesn't match, assume it's already just the ID
kp_id = kp_full_name
# print(f"DEBUG from {filename}: kp_full_name='{kp_full_name}', kp_id='{kp_id}'") # DEBUG PRINT
if kp_id:
sql = f'''
INSERT INTO kg.method_knowledge_link (method_id, knowledge_id)
VALUES ({escape_sql_string(method_id)}, {escape_sql_string(kp_id)});
'''
sql_statements.append(sql)
else:
print(f"Warning: Could not extract knowledge ID from '{kp_full_name}' in method '{method_id}'.")
except json.JSONDecodeError as e:
print(f"Error decoding JSON for links in {filename}: {e}")
continue
except Exception as e:
print(f"An unexpected error occurred while processing links in {filename}: {e}")
continue
# Write all statements to the output file
with open(OUTPUT_SQL_FILE, 'w', encoding='utf-8') as f:
for stmt in sql_statements:
f.write(stmt.strip() + "\n\n")
print(f"SQL insert statements generated successfully to {OUTPUT_SQL_FILE}")
# Execute the generation
generate_sql_inserts()