import json import os import re # --- Configuration --- INPUT_DIR = "/Users/robertmaxwell/note/知识图谱/教科书-数学/all_副本/" OUTPUT_SQL_FILE = os.path.join(INPUT_DIR, "knowledge_graph_inserts.sql") # --- Helper Functions --- def escape_sql_string(s): """Escapes single quotes in a string for SQL.""" if s is None: return 'NULL' # Replace single quotes with two single quotes for SQL escaping # Also, handle potential triple single quotes if they somehow appear in data and conflict with f'''...''' return f"'{str(s).replace("'", "''").replace("'''", "''''''")}'" def to_sql_jsonb(data): """Converts a Python object to a SQL JSONB string.""" if data is None: # Handle None explicitly return 'NULL' # Ensure only JSON-serializable types are passed try: # json.dumps handles internal quotes, so we just need to escape the outer single quotes if any json_str = json.dumps(data, ensure_ascii=False) return escape_sql_string(json_str) except TypeError: print(f"Warning: Non-JSON serializable data found: {data}. Returning NULL.") return 'NULL' def to_sql_text_array(arr): """Converts a Python list of strings to a SQL TEXT[] array string.""" if not arr: # Handle empty list explicitly return 'NULL' # Filter out None values from the array if any, and ensure all are strings clean_arr = [str(item) for item in arr if item is not None] if not clean_arr: return 'NULL' escaped_elements = [escape_sql_string(item) for item in clean_arr] return f"ARRAY[{', '.join(escaped_elements)}]" def parse_id_for_chapter(item_id): """Parses the item ID to extract chapter information (if needed, though not used in current schema).""" match = re.match(r'^[KMT](\d+)-(\d+)(?:-(\d+))?', item_id) if match: chapter_num = match.group(1) section_num = match.group(2) subsection_num = match.group(3) if match.group(3) else '0' return f"Chapter {chapter_num}.{section_num}.{subsection_num}" return "Unknown Chapter" # --- Main Script Logic --- def generate_sql_inserts(): sql_statements = [] # Add schema and table creation DDL sql_statements.append(''' -- 首先,创建一个Schema来组织知识图谱相关的表,这是一种好的实践 CREATE SCHEMA IF NOT EXISTS kg; -- 1. 知识点表 (Knowledge Table) CREATE TABLE IF NOT EXISTS kg.knowledge ( id VARCHAR(50) PRIMARY KEY, -- 对应 "编号" level VARCHAR(50), -- 对应 "层次" name TEXT NOT NULL, -- 对应 "名称" type VARCHAR(50), -- 对应 "类型" core_content JSONB, -- 【JSONB】用于存储复杂的JSON对象 principle JSONB, -- 【JSONB】 conditions JSONB, -- 【JSONB】 prerequisites VARCHAR(50)[], -- 【TEXT[]】用于存储编号数组 related_content JSONB, -- 【JSONB】 importance VARCHAR(50), -- 对应 "重要程度" exam_ways TEXT[] -- 【TEXT[]】用于存储字符串数组 ); -- 2. 方法表 (Method Table) CREATE TABLE IF NOT EXISTS kg.methods ( id VARCHAR(50) PRIMARY KEY, -- 对应 "编号" name TEXT NOT NULL, -- 对应 "名称" type VARCHAR(50), -- 对应 "类型" scenarios JSONB, -- 【JSONB】适用场景 steps JSONB, -- 【JSONB】方法步骤 math_ideas TEXT[], -- 【TEXT[]】数学思想 strategy TEXT, -- 解题策略 prerequisite_methods VARCHAR(50)[], -- 【TEXT[]】前置方法 common_errors JSONB, -- 【JSONB】常见错误 difficulty SMALLINT, -- 难度等级 location TEXT -- 教材位置 ); -- 3. 题目表 (Problem Table) CREATE TABLE IF NOT EXISTS kg.problems ( id VARCHAR(50) PRIMARY KEY, -- 对应 "编号" type VARCHAR(50), -- 题目类型 source_info JSONB, -- 【JSONB】来源信息 content JSONB, -- 【JSONB】题目内容 question_types JSONB, -- 【JSONB】题型分类 difficulty JSONB -- 【JSONB】难度评估 ); -- 4. 关联表 (Link Tables) - 用于处理多对多关系 -- 方法支撑的知识点 CREATE TABLE IF NOT EXISTS kg.method_knowledge_link ( method_id VARCHAR(50) REFERENCES kg.methods(id), knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id), PRIMARY KEY (method_id, knowledge_id) ); -- 题目考查的知识点 CREATE TABLE IF NOT EXISTS kg.problem_knowledge_link ( problem_id VARCHAR(50) REFERENCES kg.problems(id), knowledge_id VARCHAR(50) REFERENCES kg.knowledge(id), -- 可以增加一个字段表示是“主要考查”还是“辅助涉及” relevance VARCHAR(20) DEFAULT '主要考查', PRIMARY KEY (problem_id, knowledge_id, relevance) ); -- 题目使用的方法 CREATE TABLE IF NOT EXISTS kg.problem_method_link ( problem_id VARCHAR(50) REFERENCES kg.problems(id), method_id VARCHAR(50) REFERENCES kg.methods(id), PRIMARY KEY (problem_id, method_id) ); ''') sql_statements.append("\n-- --- INSERT DATA ---\n") # Process Knowledge Points knowledge_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("knowledge-") and f.endswith(".json")] for filename in knowledge_files: filepath = os.path.join(INPUT_DIR, filename) print(f"Processing knowledge file: {filename}") # Debugging line try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) for item in data.get('knowledge_list', []): item_id = item.get('编号') if not item_id: print(f"Warning: Skipping knowledge item in {filename} due to missing '编号'.") continue sql = f''' INSERT INTO kg.knowledge (id, level, name, type, core_content, principle, conditions, prerequisites, related_content, importance, exam_ways) VALUES ( {escape_sql_string(item_id)}, {escape_sql_string(item.get('层次'))}, {escape_sql_string(item.get('名称'))}, {escape_sql_string(item.get('类型'))}, {to_sql_jsonb(item.get('核心内容'))}, {to_sql_jsonb(item.get('原理说明'))}, {to_sql_jsonb(item.get('适用条件'))}, {to_sql_text_array(item.get('前置知识'))}, {to_sql_jsonb(item.get('关联内容'))}, {escape_sql_string(item.get('重要程度'))}, {to_sql_text_array(item.get('考查方式'))} ); ''' sql_statements.append(sql) except json.JSONDecodeError as e: print(f"Error decoding JSON in {filename}: {e}") continue except Exception as e: print(f"An unexpected error occurred while processing {filename}: {e}") continue # Process Methods method_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("method-") and f.endswith(".json")] for filename in method_files: filepath = os.path.join(INPUT_DIR, filename) print(f"Processing method file: {filename}") # Debugging line try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) for item in data.get('method_list', []): item_id = item.get('编号') if not item_id: print(f"Warning: Skipping method item in {filename} due to missing '编号'.") continue sql = f''' INSERT INTO kg.methods (id, name, type, scenarios, steps, math_ideas, strategy, prerequisite_methods, common_errors, difficulty, location) VALUES ( {escape_sql_string(item_id)}, {escape_sql_string(item.get('名称'))}, {escape_sql_string(item.get('类型'))}, {to_sql_jsonb(item.get('适用场景'))}, {to_sql_jsonb(item.get('方法步骤'))}, {to_sql_text_array(item.get('数学思想'))}, {escape_sql_string(item.get('解题策略'))}, {to_sql_text_array(item.get('前置方法'))}, {to_sql_jsonb(item.get('常见错误'))}, {item.get('难度等级', 'NULL')}, {escape_sql_string(item.get('教材位置'))} ); ''' sql_statements.append(sql) except json.JSONDecodeError as e: print(f"Error decoding JSON in {filename}: {e}") continue except Exception as e: print(f"An unexpected error occurred while processing {filename}: {e}") continue # Process Problems and Link Tables problem_files = [f for f in os.listdir(INPUT_DIR) if f.startswith("problems-") and f.endswith(".json")] for filename in problem_files: filepath = os.path.join(INPUT_DIR, filename) print(f"Processing problem file: {filename}") # Debugging line try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) for item in data.get('problem_list', []): problem_id = item.get('题目基本信息', {}).get('编号') if not problem_id: print(f"Warning: Skipping problem item in {filename} due to missing '编号'.") continue # Insert into kg.problems sql = f''' INSERT INTO kg.problems (id, type, source_info, content, question_types, difficulty) VALUES ( {escape_sql_string(problem_id)}, {escape_sql_string(item.get('题目基本信息', {}).get('题目类型'))}, {to_sql_jsonb(item.get('题目基本信息', {}).get('来源信息'))}, {to_sql_jsonb(item.get('题目内容'))}, {to_sql_jsonb(item.get('题型分类'))}, {to_sql_jsonb(item.get('难度评估'))} ); ''' sql_statements.append(sql) # Insert into kg.problem_knowledge_link knowledge_annotations = item.get('知识点标注', {}) for sub_question_key, annotations in knowledge_annotations.items(): for main_kp in annotations.get('主要考查', []): kp_id = main_kp.get('知识点编号') if kp_id: sql = f''' INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance) VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '主要考查'); ''' sql_statements.append(sql) for aux_kp in annotations.get('辅助涉及', []): kp_id = aux_kp.get('知识点编号') if kp_id: sql = f''' INSERT INTO kg.problem_knowledge_link (problem_id, knowledge_id, relevance) VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(kp_id)}, '辅助涉及'); ''' sql_statements.append(sql) # Insert into kg.problem_method_link method_annotations = item.get('方法标注', {}) for sub_question_key, methods_list in method_annotations.items(): for method in methods_list: method_id = method.get('方法编号') if method_id: sql = f''' INSERT INTO kg.problem_method_link (problem_id, method_id) VALUES ({escape_sql_string(problem_id)}, {escape_sql_string(method_id)});''' sql_statements.append(sql) except json.JSONDecodeError as e: print(f"Error decoding JSON in {filename}: {e}") continue except Exception as e: print(f"An unexpected error occurred while processing {filename}: {e}") continue # Insert into kg.method_knowledge_link (from method files) # This needs to be done after all knowledge points are inserted # Re-process method files for links for filename in method_files: filepath = os.path.join(INPUT_DIR, filename) print(f"Processing method file for links: {filename}") # Debugging line try: with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) for item in data.get('method_list', []): method_id = item.get('编号') if not method_id: continue # "支撑知识点" in method JSON supporting_kps = item.get('支撑知识点', []) for kp_full_name in supporting_kps: # Attempt to extract ID using regex for "ID NAME" format match = re.match(r'^([K]\d+-\d+-\d+-\d+)', kp_full_name) if match: kp_id = match.group(1) else: # If regex doesn't match, assume it's already just the ID kp_id = kp_full_name # print(f"DEBUG from {filename}: kp_full_name='{kp_full_name}', kp_id='{kp_id}'") # DEBUG PRINT if kp_id: sql = f''' INSERT INTO kg.method_knowledge_link (method_id, knowledge_id) VALUES ({escape_sql_string(method_id)}, {escape_sql_string(kp_id)}); ''' sql_statements.append(sql) else: print(f"Warning: Could not extract knowledge ID from '{kp_full_name}' in method '{method_id}'.") except json.JSONDecodeError as e: print(f"Error decoding JSON for links in {filename}: {e}") continue except Exception as e: print(f"An unexpected error occurred while processing links in {filename}: {e}") continue # Write all statements to the output file with open(OUTPUT_SQL_FILE, 'w', encoding='utf-8') as f: for stmt in sql_statements: f.write(stmt.strip() + "\n\n") print(f"SQL insert statements generated successfully to {OUTPUT_SQL_FILE}") # Execute the generation generate_sql_inserts()