318 lines
14 KiB
Python
318 lines
14 KiB
Python
import json
|
||
import os
|
||
import re
|
||
|
||
# --- Configuration ---
|
||
INPUT_DIR = "/Users/robertmaxwell/note/知识图谱/教科书-数学/all_副本/"
|
||
OUTPUT_SQL_FILE = os.path.join(INPUT_DIR, "knowledge_graph_inserts.sql")
|
||
|
||
# --- Helper Functions ---
|
||
def escape_sql_string(s):
|
||
"""Escapes single quotes in a string for SQL."""
|
||
if s is None:
|
||
return 'NULL'
|
||
# Replace single quotes with two single quotes for SQL escaping
|
||
# Also, handle potential triple single quotes if they somehow appear in data and conflict with f'''...'''
|
||
return f"'{str(s).replace("'", "''").replace("'''", "''''''")}'"
|
||
|
||
def to_sql_jsonb(data):
|
||
"""Converts a Python object to a SQL JSONB string."""
|
||
if data is None: # Handle None explicitly
|
||
return 'NULL'
|
||
# Ensure only JSON-serializable types are passed
|
||
try:
|
||
# json.dumps handles internal quotes, so we just need to escape the outer single quotes if any
|
||
json_str = json.dumps(data, ensure_ascii=False)
|
||
return escape_sql_string(json_str)
|
||
except TypeError:
|
||
print(f"Warning: Non-JSON serializable data found: {data}. Returning NULL.")
|
||
return 'NULL'
|
||
|
||
def to_sql_text_array(arr):
|
||
"""Converts a Python list of strings to a SQL TEXT[] array string."""
|
||
if not arr: # Handle empty list explicitly
|
||
return 'NULL'
|
||
# Filter out None values from the array if any, and ensure all are strings
|
||
clean_arr = [str(item) for item in arr if item is not None]
|
||
if not clean_arr:
|
||
return 'NULL'
|
||
escaped_elements = [escape_sql_string(item) for item in clean_arr]
|
||
return f"ARRAY[{', '.join(escaped_elements)}]"
|
||
|
||
def parse_id_for_chapter(item_id):
|
||
"""Parses the item ID to extract chapter information (if needed, though not used in current schema)."""
|
||
match = re.match(r'^[KMT](\d+)-(\d+)(?:-(\d+))?', item_id)
|
||
if match:
|
||
chapter_num = match.group(1)
|
||
section_num = match.group(2)
|
||
subsection_num = match.group(3) if match.group(3) else '0'
|
||
return f"Chapter {chapter_num}.{section_num}.{subsection_num}"
|
||
return "Unknown Chapter"
|
||
|
||
# --- Main Script Logic ---
|
||
def generate_sql_inserts():
|
||
sql_statements = []
|
||
|
||
# Add schema and table creation DDL
|
||
sql_statements.append('''
|
||
-- 首先,创建一个Schema来组织知识图谱相关的表,这是一种好的实践
|
||
CREATE SCHEMA IF NOT EXISTS kg;
|
||
|
||
-- 1. 知识点表 (Knowledge Table)
|
||
CREATE TABLE IF NOT EXISTS kg.knowledge (
|
||
id VARCHAR(50) PRIMARY KEY, -- 对应 "编号"
|
||
level VARCHAR(50), -- 对应 "层次"
|
||
name TEXT NOT NULL, -- 对应 "名称"
|
||
type VARCHAR(50), -- 对应 "类型"
|
||
core_content JSONB, -- 【JSONB】用于存储复杂的JSON对象
|
||
principle JSONB, -- 【JSONB】
|
||
conditions JSONB, -- 【JSONB】
|
||
prerequisites VARCHAR(50)[], -- 【TEXT[]】用于存储编号数组
|
||
related_content JSONB, -- 【JSONB】
|
||
importance VARCHAR(50), -- 对应 "重要程度"
|
||
exam_ways TEXT[] -- 【TEXT[]】用于存储字符串数组
|
||
);
|
||
|
||
-- 2. 方法表 (Method Table)
|
||
CREATE TABLE IF NOT EXISTS kg.methods (
|
||
id VARCHAR(50) PRIMARY KEY, -- 对应 "编号"
|
||
name TEXT NOT NULL, -- 对应 "名称"
|
||
type VARCHAR(50), -- 对应 "类型"
|
||
scenarios JSONB, -- 【JSONB】适用场景
|
||
steps JSONB, -- 【JSONB】方法步骤
|
||
math_ideas TEXT[], -- 【TEXT[]】数学思想
|
||
strategy TEXT, -- 解题策略
|
||
prerequisite_methods VARCHAR(50)[], -- 【TEXT[]】前置方法
|
||
common_errors JSONB, -- 【JSONB】常见错误
|
||
difficulty SMALLINT, -- 难度等级
|
||
location TEXT -- 教材位置
|
||
);
|
||
|
||
-- 3. 题目表 (Problem Table)
|
||
CREATE TABLE IF NOT EXISTS kg.problems (
|
||
id VARCHAR(50) PRIMARY KEY, -- 对应 "编号"
|
||
type VARCHAR(50), -- 题目类型
|
||
source_info JSONB, -- 【JSONB】来源信息
|
||
content JSONB, -- 【JSONB】题目内容
|
||
question_types JSONB, -- 【JSONB】题型分类
|
||
difficulty JSONB -- 【JSONB】难度评估
|
||
);
|
||
|
||
-- 4. 关联表 (Link Tables) - 用于处理多对多关系
|
||
-- 方法支撑的知识点
|
||
CREATE TABLE IF NOT EXISTS kg.method_knowledge_link (
|
||
method_id VARCHAR(50) REFERENCES kg.methods(id),
|
||
knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id),
|
||
PRIMARY KEY (method_id, knowledge_id)
|
||
);
|
||
|
||
-- 题目考查的知识点
|
||
CREATE TABLE IF NOT EXISTS kg.problem_knowledge_link (
|
||
problem_id VARCHAR(50) REFERENCES kg.problems(id),
|
||
knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id),
|
||
-- 可以增加一个字段表示是“主要考查”还是“辅助涉及”
|
||
relevance VARCHAR(20) DEFAULT '主要考查',
|
||
PRIMARY KEY (problem_id, knowledge_id, relevance)
|
||
);
|
||
|
||
-- 题目使用的方法
|
||
CREATE TABLE IF NOT EXISTS kg.problem_method_link (
|
||
problem_id VARCHAR(50) REFERENCES kg.problems(id),
|
||
method_id VARCHAR(50) REFERENCES kg.methods(id),
|
||
PRIMARY KEY (problem_id, method_id)
|
||
);
|
||
''')
|
||
sql_statements.append("\n-- --- INSERT DATA ---\n")
|
||
|
||
# Process Knowledge Points
|
||
knowledge_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("knowledge-") and f.endswith(".json")]
|
||
for filename in knowledge_files:
|
||
filepath = os.path.join(INPUT_DIR, filename)
|
||
print(f"Processing knowledge file: {filename}") # Debugging line
|
||
try:
|
||
with open(filepath, 'r', encoding='utf-8') as f:
|
||
data = json.load(f)
|
||
for item in data.get('knowledge_list', []):
|
||
item_id = item.get('编号')
|
||
if not item_id:
|
||
print(f"Warning: Skipping knowledge item in {filename} due to missing '编号'.")
|
||
continue
|
||
|
||
sql = f'''
|
||
INSERT INTO kg.knowledge (id, level, name, type, core_content, principle, conditions, prerequisites, related_content, importance, exam_ways)
|
||
VALUES (
|
||
{escape_sql_string(item_id)},
|
||
{escape_sql_string(item.get('层次'))},
|
||
{escape_sql_string(item.get('名称'))},
|
||
{escape_sql_string(item.get('类型'))},
|
||
{to_sql_jsonb(item.get('核心内容'))},
|
||
{to_sql_jsonb(item.get('原理说明'))},
|
||
{to_sql_jsonb(item.get('适用条件'))},
|
||
{to_sql_text_array(item.get('前置知识'))},
|
||
{to_sql_jsonb(item.get('关联内容'))},
|
||
{escape_sql_string(item.get('重要程度'))},
|
||
{to_sql_text_array(item.get('考查方式'))}
|
||
);
|
||
'''
|
||
sql_statements.append(sql)
|
||
except json.JSONDecodeError as e:
|
||
print(f"Error decoding JSON in {filename}: {e}")
|
||
continue
|
||
except Exception as e:
|
||
print(f"An unexpected error occurred while processing {filename}: {e}")
|
||
continue
|
||
|
||
# Process Methods
|
||
method_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("method-") and f.endswith(".json")]
|
||
for filename in method_files:
|
||
filepath = os.path.join(INPUT_DIR, filename)
|
||
print(f"Processing method file: {filename}") # Debugging line
|
||
try:
|
||
with open(filepath, 'r', encoding='utf-8') as f:
|
||
data = json.load(f)
|
||
for item in data.get('method_list', []):
|
||
item_id = item.get('编号')
|
||
if not item_id:
|
||
print(f"Warning: Skipping method item in {filename} due to missing '编号'.")
|
||
continue
|
||
|
||
sql = f'''
|
||
INSERT INTO kg.methods (id, name, type, scenarios, steps, math_ideas, strategy, prerequisite_methods, common_errors, difficulty, location)
|
||
VALUES (
|
||
{escape_sql_string(item_id)},
|
||
{escape_sql_string(item.get('名称'))},
|
||
{escape_sql_string(item.get('类型'))},
|
||
{to_sql_jsonb(item.get('适用场景'))},
|
||
{to_sql_jsonb(item.get('方法步骤'))},
|
||
{to_sql_text_array(item.get('数学思想'))},
|
||
{escape_sql_string(item.get('解题策略'))},
|
||
{to_sql_text_array(item.get('前置方法'))},
|
||
{to_sql_jsonb(item.get('常见错误'))},
|
||
{item.get('难度等级', 'NULL')},
|
||
{escape_sql_string(item.get('教材位置'))}
|
||
);
|
||
'''
|
||
sql_statements.append(sql)
|
||
except json.JSONDecodeError as e:
|
||
print(f"Error decoding JSON in {filename}: {e}")
|
||
continue
|
||
except Exception as e:
|
||
print(f"An unexpected error occurred while processing {filename}: {e}")
|
||
continue
|
||
|
||
# Process Problems and Link Tables
|
||
problem_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("problems-") and f.endswith(".json")]
|
||
for filename in problem_files:
|
||
filepath = os.path.join(INPUT_DIR, filename)
|
||
print(f"Processing problem file: {filename}") # Debugging line
|
||
try:
|
||
with open(filepath, 'r', encoding='utf-8') as f:
|
||
data = json.load(f)
|
||
for item in data.get('problem_list', []):
|
||
problem_id = item.get('题目基本信息', {}).get('编号')
|
||
if not problem_id:
|
||
print(f"Warning: Skipping problem item in {filename} due to missing '编号'.")
|
||
continue
|
||
|
||
# Insert into kg.problems
|
||
sql = f'''
|
||
INSERT INTO kg.problems (id, type, source_info, content, question_types, difficulty)
|
||
VALUES (
|
||
{escape_sql_string(problem_id)},
|
||
{escape_sql_string(item.get('题目基本信息', {}).get('题目类型'))},
|
||
{to_sql_jsonb(item.get('题目基本信息', {}).get('来源信息'))},
|
||
{to_sql_jsonb(item.get('题目内容'))},
|
||
{to_sql_jsonb(item.get('题型分类'))},
|
||
{to_sql_jsonb(item.get('难度评估'))}
|
||
);
|
||
'''
|
||
sql_statements.append(sql)
|
||
|
||
# Insert into kg.problem_knowledge_link
|
||
knowledge_annotations = item.get('知识点标注', {})
|
||
for sub_question_key, annotations in knowledge_annotations.items():
|
||
for main_kp in annotations.get('主要考查', []):
|
||
kp_id = main_kp.get('知识点编号')
|
||
if kp_id:
|
||
sql = f'''
|
||
INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance)
|
||
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '主要考查');
|
||
'''
|
||
sql_statements.append(sql)
|
||
for aux_kp in annotations.get('辅助涉及', []):
|
||
kp_id = aux_kp.get('知识点编号')
|
||
if kp_id:
|
||
sql = f'''
|
||
INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance)
|
||
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '辅助涉及');
|
||
'''
|
||
sql_statements.append(sql)
|
||
|
||
# Insert into kg.problem_method_link
|
||
method_annotations = item.get('方法标注', {})
|
||
for sub_question_key, methods_list in method_annotations.items():
|
||
for method in methods_list:
|
||
method_id = method.get('方法编号')
|
||
if method_id:
|
||
sql = f'''
|
||
INSERT INTO kg.problem_method_link (problem_id, method_id)
|
||
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(method_id)});'''
|
||
sql_statements.append(sql)
|
||
except json.JSONDecodeError as e:
|
||
print(f"Error decoding JSON in {filename}: {e}")
|
||
continue
|
||
except Exception as e:
|
||
print(f"An unexpected error occurred while processing {filename}: {e}")
|
||
continue
|
||
|
||
# Insert into kg.method_knowledge_link (from method files)
|
||
# This needs to be done after all knowledge points are inserted
|
||
# Re-process method files for links
|
||
for filename in method_files:
|
||
filepath = os.path.join(INPUT_DIR, filename)
|
||
print(f"Processing method file for links: {filename}") # Debugging line
|
||
try:
|
||
with open(filepath, 'r', encoding='utf-8') as f:
|
||
data = json.load(f)
|
||
for item in data.get('method_list', []):
|
||
method_id = item.get('编号')
|
||
if not method_id:
|
||
continue
|
||
|
||
# "支撑知识点" in method JSON
|
||
supporting_kps = item.get('支撑知识点', [])
|
||
for kp_full_name in supporting_kps:
|
||
# Attempt to extract ID using regex for "ID NAME" format
|
||
match = re.match(r'^([K]\d+-\d+-\d+-\d+)', kp_full_name)
|
||
if match:
|
||
kp_id = match.group(1)
|
||
else:
|
||
# If regex doesn't match, assume it's already just the ID
|
||
kp_id = kp_full_name
|
||
|
||
# print(f"DEBUG from {filename}: kp_full_name='{kp_full_name}', kp_id='{kp_id}'") # DEBUG PRINT
|
||
if kp_id:
|
||
sql = f'''
|
||
INSERT INTO kg.method_knowledge_link (method_id, knowledge_id)
|
||
VALUES ({escape_sql_string(method_id)}, {escape_sql_string(kp_id)});
|
||
'''
|
||
sql_statements.append(sql)
|
||
else:
|
||
print(f"Warning: Could not extract knowledge ID from '{kp_full_name}' in method '{method_id}'.")
|
||
except json.JSONDecodeError as e:
|
||
print(f"Error decoding JSON for links in {filename}: {e}")
|
||
continue
|
||
except Exception as e:
|
||
print(f"An unexpected error occurred while processing links in {filename}: {e}")
|
||
continue
|
||
|
||
# Write all statements to the output file
|
||
with open(OUTPUT_SQL_FILE, 'w', encoding='utf-8') as f:
|
||
for stmt in sql_statements:
|
||
f.write(stmt.strip() + "\n\n")
|
||
|
||
print(f"SQL insert statements generated successfully to {OUTPUT_SQL_FILE}")
|
||
|
||
# Execute the generation
|
||
generate_sql_inserts()
|