note/知识图谱/教科书-数学/all_副本/generate_sql_inserts.py

import json
import os
import re

# --- Configuration ---
INPUT_DIR = "/Users/robertmaxwell/note/知识图谱/教科书-数学/all_副本/"
OUTPUT_SQL_FILE = os.path.join(INPUT_DIR, "knowledge_graph_inserts.sql")

# --- Helper Functions ---
def escape_sql_string(s):
    """Escapes single quotes in a string for SQL."""
    if s is None:
        return 'NULL'
    # Replace single quotes with two single quotes for SQL escaping
    # Also, handle potential triple single quotes if they somehow appear in data and conflict with f'''...'''
    return f"'{str(s).replace("'", "''").replace("'''", "''''''")}'"

def to_sql_jsonb(data):
    """Converts a Python object to a SQL JSONB string."""
    if data is None: # Handle None explicitly
        return 'NULL'
    # Ensure only JSON-serializable types are passed
    try:
        # json.dumps handles internal quotes, so we just need to escape the outer single quotes if any
        json_str = json.dumps(data, ensure_ascii=False)
        return escape_sql_string(json_str)
    except TypeError:
        print(f"Warning: Non-JSON serializable data found: {data}. Returning NULL.")
        return 'NULL'

def to_sql_text_array(arr):
    """Converts a Python list of strings to a SQL TEXT[] array string."""
    if not arr: # Handle empty list explicitly
        return 'NULL'
    # Filter out None values from the array if any, and ensure all are strings
    clean_arr = [str(item) for item in arr if item is not None]
    if not clean_arr:
        return 'NULL'
    escaped_elements = [escape_sql_string(item) for item in clean_arr]
    return f"ARRAY[{', '.join(escaped_elements)}]"

def parse_id_for_chapter(item_id):
    """Parses the item ID to extract chapter information (if needed, though not used in current schema)."""
    match = re.match(r'^[KMT](\d+)-(\d+)(?:-(\d+))?', item_id)
    if match:
        chapter_num = match.group(1)
        section_num = match.group(2)
        subsection_num = match.group(3) if match.group(3) else '0'
        return f"Chapter {chapter_num}.{section_num}.{subsection_num}"
    return "Unknown Chapter"

# --- Main Script Logic ---
def generate_sql_inserts():
    sql_statements = []

    # Add schema and table creation DDL
    sql_statements.append('''
-- 首先，创建一个Schema来组织知识图谱相关的表，这是一种好的实践
CREATE SCHEMA IF NOT EXISTS kg;

-- 1. 知识点表 (Knowledge Table)
CREATE TABLE IF NOT EXISTS kg.knowledge (
    id VARCHAR(50) PRIMARY KEY,       -- 对应 "编号"
    level VARCHAR(50),                 -- 对应 "层次"
    name TEXT NOT NULL,                -- 对应 "名称"
    type VARCHAR(50),                  -- 对应 "类型"
    core_content JSONB,                -- 【JSONB】用于存储复杂的JSON对象
    principle JSONB,                   -- 【JSONB】
    conditions JSONB,                  -- 【JSONB】
    prerequisites VARCHAR(50)[],       -- 【TEXT[]】用于存储编号数组
    related_content JSONB,             -- 【JSONB】
    importance VARCHAR(50),            -- 对应 "重要程度"
    exam_ways TEXT[]                   -- 【TEXT[]】用于存储字符串数组
);

-- 2. 方法表 (Method Table)
CREATE TABLE IF NOT EXISTS kg.methods (
    id VARCHAR(50) PRIMARY KEY,        -- 对应 "编号"
    name TEXT NOT NULL,                 -- 对应 "名称"
    type VARCHAR(50),                   -- 对应 "类型"
    scenarios JSONB,                    -- 【JSONB】适用场景
    steps JSONB,                        -- 【JSONB】方法步骤
    math_ideas TEXT[],                  -- 【TEXT[]】数学思想
    strategy TEXT,                      -- 解题策略
    prerequisite_methods VARCHAR(50)[], -- 【TEXT[]】前置方法
    common_errors JSONB,                -- 【JSONB】常见错误
    difficulty SMALLINT,                -- 难度等级
    location TEXT                       -- 教材位置
);

-- 3. 题目表 (Problem Table)
CREATE TABLE IF NOT EXISTS kg.problems (
    id VARCHAR(50) PRIMARY KEY,       -- 对应 "编号"
    type VARCHAR(50),                  -- 题目类型
    source_info JSONB,                 -- 【JSONB】来源信息
    content JSONB,                     -- 【JSONB】题目内容
    question_types JSONB,              -- 【JSONB】题型分类
    difficulty JSONB                   -- 【JSONB】难度评估
);

-- 4. 关联表 (Link Tables) - 用于处理多对多关系
-- 方法支撑的知识点
CREATE TABLE IF NOT EXISTS kg.method_knowledge_link (
    method_id VARCHAR(50) REFERENCES kg.methods(id),
    knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id),
    PRIMARY KEY (method_id, knowledge_id)
);

-- 题目考查的知识点
CREATE TABLE IF NOT EXISTS kg.problem_knowledge_link (
    problem_id VARCHAR(50) REFERENCES kg.problems(id),
    knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id),
    -- 可以增加一个字段表示是“主要考查”还是“辅助涉及”
    relevance VARCHAR(20) DEFAULT '主要考查',
    PRIMARY KEY (problem_id, knowledge_id, relevance)
);

-- 题目使用的方法
CREATE TABLE IF NOT EXISTS kg.problem_method_link (
    problem_id VARCHAR(50) REFERENCES kg.problems(id),
    method_id VARCHAR(50) REFERENCES kg.methods(id),
    PRIMARY KEY (problem_id, method_id)
);
''')
    sql_statements.append("\n-- --- INSERT DATA ---\n")

    # Process Knowledge Points
    knowledge_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("knowledge-") and f.endswith(".json")]
    for filename in knowledge_files:
        filepath = os.path.join(INPUT_DIR, filename)
        print(f"Processing knowledge file: {filename}") # Debugging line
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for item in data.get('knowledge_list', []):
                    item_id = item.get('编号')
                    if not item_id:
                        print(f"Warning: Skipping knowledge item in {filename} due to missing '编号'.")
                        continue

                    sql = f'''
INSERT INTO kg.knowledge (id, level, name, type, core_content, principle, conditions, prerequisites, related_content, importance, exam_ways)
VALUES (
    {escape_sql_string(item_id)},
    {escape_sql_string(item.get('层次'))},
    {escape_sql_string(item.get('名称'))},
    {escape_sql_string(item.get('类型'))},
    {to_sql_jsonb(item.get('核心内容'))},
    {to_sql_jsonb(item.get('原理说明'))},
    {to_sql_jsonb(item.get('适用条件'))},
    {to_sql_text_array(item.get('前置知识'))},
    {to_sql_jsonb(item.get('关联内容'))},
    {escape_sql_string(item.get('重要程度'))},
    {to_sql_text_array(item.get('考查方式'))}
);
'''
                    sql_statements.append(sql)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {filename}: {e}")
            continue
        except Exception as e:
            print(f"An unexpected error occurred while processing {filename}: {e}")
            continue

    # Process Methods
    method_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("method-") and f.endswith(".json")]
    for filename in method_files:
        filepath = os.path.join(INPUT_DIR, filename)
        print(f"Processing method file: {filename}") # Debugging line
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for item in data.get('method_list', []):
                    item_id = item.get('编号')
                    if not item_id:
                        print(f"Warning: Skipping method item in {filename} due to missing '编号'.")
                        continue

                    sql = f'''
INSERT INTO kg.methods (id, name, type, scenarios, steps, math_ideas, strategy, prerequisite_methods, common_errors, difficulty, location)
VALUES (
    {escape_sql_string(item_id)},
    {escape_sql_string(item.get('名称'))},
    {escape_sql_string(item.get('类型'))},
    {to_sql_jsonb(item.get('适用场景'))},
    {to_sql_jsonb(item.get('方法步骤'))},
    {to_sql_text_array(item.get('数学思想'))},
    {escape_sql_string(item.get('解题策略'))},
    {to_sql_text_array(item.get('前置方法'))},
    {to_sql_jsonb(item.get('常见错误'))},
    {item.get('难度等级', 'NULL')},
    {escape_sql_string(item.get('教材位置'))}
);
'''
                    sql_statements.append(sql)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {filename}: {e}")
            continue
        except Exception as e:
            print(f"An unexpected error occurred while processing {filename}: {e}")
            continue

    # Process Problems and Link Tables
    problem_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("problems-") and f.endswith(".json")]
    for filename in problem_files:
        filepath = os.path.join(INPUT_DIR, filename)
        print(f"Processing problem file: {filename}") # Debugging line
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for item in data.get('problem_list', []):
                    problem_id = item.get('题目基本信息', {}).get('编号')
                    if not problem_id:
                        print(f"Warning: Skipping problem item in {filename} due to missing '编号'.")
                        continue

                    # Insert into kg.problems
                    sql = f'''
INSERT INTO kg.problems (id, type, source_info, content, question_types, difficulty)
VALUES (
    {escape_sql_string(problem_id)},
    {escape_sql_string(item.get('题目基本信息', {}).get('题目类型'))},
    {to_sql_jsonb(item.get('题目基本信息', {}).get('来源信息'))},
    {to_sql_jsonb(item.get('题目内容'))},
    {to_sql_jsonb(item.get('题型分类'))},
    {to_sql_jsonb(item.get('难度评估'))}
);
'''
                    sql_statements.append(sql)

                    # Insert into kg.problem_knowledge_link
                    knowledge_annotations = item.get('知识点标注', {})
                    for sub_question_key, annotations in knowledge_annotations.items():
                        for main_kp in annotations.get('主要考查', []):
                            kp_id = main_kp.get('知识点编号')
                            if kp_id:
                                sql = f'''
INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance)
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '主要考查');
'''
                                sql_statements.append(sql)
                        for aux_kp in annotations.get('辅助涉及', []):
                            kp_id = aux_kp.get('知识点编号')
                            if kp_id:
                                sql = f'''
INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance)
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '辅助涉及');
'''
                                sql_statements.append(sql)

                    # Insert into kg.problem_method_link
                    method_annotations = item.get('方法标注', {})
                    for sub_question_key, methods_list in method_annotations.items():
                        for method in methods_list:
                            method_id = method.get('方法编号')
                            if method_id:
                                sql = f'''
INSERT INTO kg.problem_method_link (problem_id, method_id)
VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(method_id)});'''
                                sql_statements.append(sql)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {filename}: {e}")
            continue
        except Exception as e:
            print(f"An unexpected error occurred while processing {filename}: {e}")
            continue

    # Insert into kg.method_knowledge_link (from method files)
    # This needs to be done after all knowledge points are inserted
    # Re-process method files for links
    for filename in method_files:
        filepath = os.path.join(INPUT_DIR, filename)
        print(f"Processing method file for links: {filename}") # Debugging line
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for item in data.get('method_list', []):
                    method_id = item.get('编号')
                    if not method_id:
                        continue

                    # "支撑知识点" in method JSON
                    supporting_kps = item.get('支撑知识点', [])
                    for kp_full_name in supporting_kps:
                        # Attempt to extract ID using regex for "ID NAME" format
                        match = re.match(r'^([K]\d+-\d+-\d+-\d+)', kp_full_name)
                        if match:
                            kp_id = match.group(1)
                        else:
                            # If regex doesn't match, assume it's already just the ID
                            kp_id = kp_full_name

                        # print(f"DEBUG from {filename}: kp_full_name='{kp_full_name}', kp_id='{kp_id}'") # DEBUG PRINT
                        if kp_id:
                            sql = f'''
INSERT INTO kg.method_knowledge_link (method_id, knowledge_id)
VALUES ({escape_sql_string(method_id)}, {escape_sql_string(kp_id)});
'''
                            sql_statements.append(sql)
                        else:
                            print(f"Warning: Could not extract knowledge ID from '{kp_full_name}' in method '{method_id}'.")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for links in {filename}: {e}")
            continue
        except Exception as e:
            print(f"An unexpected error occurred while processing links in {filename}: {e}")
            continue

    # Write all statements to the output file
    with open(OUTPUT_SQL_FILE, 'w', encoding='utf-8') as f:
        for stmt in sql_statements:
            f.write(stmt.strip() + "\n\n")

    print(f"SQL insert statements generated successfully to {OUTPUT_SQL_FILE}")

# Execute the generation
generate_sql_inserts()