note/知识图谱/教科书-数学/all/data_import.py

#!/usr/bin/env python3
"""
高中数学知识图谱数据导入脚本
将JSON格式的知识图谱数据导入到PostgreSQL数据库
"""

import json
import os
import re
import psycopg2
from psycopg2.extras import Json
from typing import Dict, List, Any, Optional, Tuple
import logging
from datetime import datetime

# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('data_import.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class KnowledgeGraphImporter:
    """知识图谱数据导入器"""

    def __init__(self, db_config: Dict[str, Any]):
        """
        初始化导入器

        Args:
            db_config: 数据库连接配置
        """
        self.db_config = db_config
        self.conn = None
        self.cursor = None

    def connect(self):
        """连接数据库"""
        try:
            self.conn = psycopg2.connect(**self.db_config)
            self.cursor = self.conn.cursor()
            logger.info("数据库连接成功")
        except Exception as e:
            logger.error(f"数据库连接失败: {e}")
            raise

    def disconnect(self):
        """断开数据库连接"""
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
        logger.info("数据库连接已关闭")

    def parse_chapter_info(self, filename: str) -> Tuple[str, str]:
        """
        从文件名解析章节信息

        Args:
            filename: 文件名（如 knowledge-必修第一章-集合与常用逻辑用语.json）

        Returns:
            (volume_type, chapter_name): (必修/选择性必修, 章节名称)
        """
        # 匹配格式: knowledge-必修第一章-集合与常用逻辑用语.json
        pattern = r'^(knowledge|method|problems)-(必修|选择性必修)(第[一二三四五六七八九十]+章)-(.+)\.json$'
        match = re.match(pattern, filename)

        if match:
            _, volume_type, chapter_part, chapter_name = match.groups()
            return volume_type, chapter_name
        else:
            # 如果不匹配标准格式，返回默认值
            logger.warning(f"文件名格式不标准: {filename}")
            return "必修", "未知章节"

    def get_or_create_textbook(self, volume_type: str, volume_number: int) -> int:
        """
        获取或创建教材记录

        Args:
            volume_type: 必修/选择性必修
            volume_number: 册数

        Returns:
            textbook_id: 教材ID
        """
        # 查询是否已存在
        query = """
        SELECT id FROM textbooks
        WHERE volume_type = %s AND volume_number = %s
        """
        self.cursor.execute(query, (volume_type, volume_number))
        result = self.cursor.fetchone()

        if result:
            return result[0]

        # 创建新教材
        insert_query = """
        INSERT INTO textbooks (name, volume_type, volume_number)
        VALUES (%s, %s, %s)
        RETURNING id
        """
        self.cursor.execute(insert_query, (
            f"高中数学{volume_type}第{volume_number}册",
            volume_type,
            volume_number
        ))
        return self.cursor.fetchone()[0]

    def get_or_create_chapter(self, textbook_id: int, chapter_name: str) -> int:
        """
        获取或创建章节记录

        Args:
            textbook_id: 教材ID
            chapter_name: 章节名称

        Returns:
            chapter_id: 章节ID
        """
        # 查询是否已存在
        query = """
        SELECT id FROM chapters
        WHERE textbook_id = %s AND chapter_name = %s
        """
        self.cursor.execute(query, (textbook_id, chapter_name))
        result = self.cursor.fetchone()

        if result:
            return result[0]

        # 创建新章节
        insert_query = """
        INSERT INTO chapters (textbook_id, chapter_name, chapter_number, chapter_order)
        VALUES (%s, %s, %s, %s)
        RETURNING id
        """

        # 尝试从章节名称中提取章节号
        chapter_number = self.extract_chapter_number(chapter_name)
        self.cursor.execute(insert_query, (
            textbook_id,
            chapter_name,
            chapter_number,
            chapter_number
        ))
        return self.cursor.fetchone()[0]

    def extract_chapter_number(self, chapter_name: str) -> int:
        """从章节名称中提取章节号"""
        chinese_numbers = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
                          '六': 6, '七': 7, '八': 8, '九': 9, '十': 10}

        for chinese, number in chinese_numbers.items():
            if chinese in chapter_name:
                return number

        return 1  # 默认返回1

    def import_knowledge_data(self, file_path: str) -> int:
        """
        导入知识点数据

        Args:
            file_path: JSON文件路径

        Returns:
            导入的知识点数量
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        filename = os.path.basename(file_path)
        volume_type, chapter_name = self.parse_chapter_info(filename)

        # 获取教材和章节ID
        volume_number = 1  # 默认第一册，可以根据实际需要调整
        textbook_id = self.get_or_create_textbook(volume_type, volume_number)
        chapter_id = self.get_or_create_chapter(textbook_id, chapter_name)

        imported_count = 0

        for knowledge in data.get('knowledge_list', []):
            try:
                # 插入知识点
                insert_query = """
                INSERT INTO knowledge_points (
                    code, name, level, type, importance, core_content,
                    principle_explanation, application_conditions, related_content,
                    examination_methods, textbook_location
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (code) DO UPDATE SET
                    name = EXCLUDED.name,
                    level = EXCLUDED.level,
                    type = EXCLUDED.type,
                    importance = EXCLUDED.importance,
                    core_content = EXCLUDED.core_content,
                    principle_explanation = EXCLUDED.principle_explanation,
                    application_conditions = EXCLUDED.application_conditions,
                    related_content = EXCLUDED.related_content,
                    examination_methods = EXCLUDED.examination_methods,
                    textbook_location = EXCLUDED.textbook_location,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
                """

                self.cursor.execute(insert_query, (
                    knowledge.get('编号'),
                    knowledge.get('名称'),
                    knowledge.get('层次'),
                    knowledge.get('类型'),
                    knowledge.get('重要程度'),
                    Json(knowledge.get('核心内容', {})),
                    Json(knowledge.get('原理说明', {})),
                    Json(knowledge.get('适用条件', {})),
                    Json(knowledge.get('关联内容', {})),
                    knowledge.get('考查方式', []),
                    knowledge.get('关联内容', {}).get('教材位置', '')
                ))

                knowledge_id = self.cursor.fetchone()[0]
                imported_count += 1

                # 处理前置知识关系
                self.import_knowledge_relations(knowledge_id, knowledge.get('前置知识', []), '前置知识')

                # 处理包含的子知识点
                related_content = knowledge.get('关联内容', {})
                sub_knowledge = related_content.get('包含的子知识点', [])
                self.import_knowledge_relations(knowledge_id, sub_knowledge, '包含关系')

            except Exception as e:
                logger.error(f"导入知识点失败 {knowledge.get('编号', 'unknown')}: {e}")
                continue

        logger.info(f"从 {file_path} 导入了 {imported_count} 个知识点")
        return imported_count

    def import_method_data(self, file_path: str) -> int:
        """
        导入方法数据

        Args:
            file_path: JSON文件路径

        Returns:
            导入的方法数量
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        filename = os.path.basename(file_path)
        volume_type, chapter_name = self.parse_chapter_info(filename)

        # 获取教材和章节ID
        volume_number = 1  # 默认第一册
        textbook_id = self.get_or_create_textbook(volume_type, volume_number)
        chapter_id = self.get_or_create_chapter(textbook_id, chapter_name)

        imported_count = 0

        for method in data.get('method_list', []):
            try:
                # 插入方法主记录
                insert_query = """
                INSERT INTO methods (
                    code, name, type, difficulty_level, application_scenarios,
                    mathematical_thoughts, problem_solving_strategy, textbook_location, description
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (code) DO UPDATE SET
                    name = EXCLUDED.name,
                    type = EXCLUDED.type,
                    difficulty_level = EXCLUDED.difficulty_level,
                    application_scenarios = EXCLUDED.application_scenarios,
                    mathematical_thoughts = EXCLUDED.mathematical_thoughts,
                    problem_solving_strategy = EXCLUDED.problem_solving_strategy,
                    textbook_location = EXCLUDED.textbook_location,
                    description = EXCLUDED.description,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
                """

                self.cursor.execute(insert_query, (
                    method.get('编号'),
                    method.get('名称'),
                    method.get('类型'),
                    method.get('难度等级'),
                    Json(method.get('适用场景', {})),
                    method.get('数学思想', []),
                    method.get('解题策略'),
                    method.get('教材位置'),
                    method.get('名称')  # 用名称作为描述
                ))

                method_id = self.cursor.fetchone()[0]
                imported_count += 1

                # 插入方法步骤
                self.import_method_steps(method_id, method.get('方法步骤', []))

                # 插入常见错误
                self.import_method_errors(method_id, method.get('常见错误', []))

                # 处理支撑知识点
                self.import_method_support_knowledge(method_id, method.get('支撑知识点', []))

            except Exception as e:
                logger.error(f"导入方法失败 {method.get('编号', 'unknown')}: {e}")
                continue

        logger.info(f"从 {file_path} 导入了 {imported_count} 个方法")
        return imported_count

    def import_problem_data(self, file_path: str) -> int:
        """
        导入题目数据

        Args:
            file_path: JSON文件路径

        Returns:
            导入的题目数量
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        filename = os.path.basename(file_path)
        volume_type, chapter_name = self.parse_chapter_info(filename)

        # 获取教材和章节ID
        volume_number = 1  # 默认第一册
        textbook_id = self.get_or_create_textbook(volume_type, volume_number)
        chapter_id = self.get_or_create_chapter(textbook_id, chapter_name)

        imported_count = 0

        for problem in data.get('problem_list', []):
            try:
                # 插入题目主记录
                insert_query = """
                INSERT INTO problems (
                    code, title, problem_type, source_info, stem, full_content,
                    problem_category, difficulty_info, textbook_location, subquestion_count
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (code) DO UPDATE SET
                    title = EXCLUDED.title,
                    problem_type = EXCLUDED.problem_type,
                    source_info = EXCLUDED.source_info,
                    stem = EXCLUDED.stem,
                    full_content = EXCLUDED.full_content,
                    problem_category = EXCLUDED.problem_category,
                    difficulty_info = EXCLUDED.difficulty_info,
                    textbook_location = EXCLUDED.textbook_location,
                    subquestion_count = EXCLUDED.subquestion_count,
                    updated_at = CURRENT_TIMESTAMP
                RETURNING id
                """

                basic_info = problem.get('题目基本信息', {})
                content = problem.get('题目内容', {})
                difficulty = problem.get('难度评估', {})
                category = problem.get('题型分类', {})

                self.cursor.execute(insert_query, (
                    basic_info.get('编号'),
                    content.get('题干'),
                    basic_info.get('题目类型'),
                    Json(basic_info.get('来源信息', {})),
                    content.get('题干'),
                    content.get('完整题目'),
                    Json(category),
                    Json(difficulty),
                    basic_info.get('来源信息', {}).get('页码', ''),
                    len(content.get('问题', []))
                ))

                problem_id = self.cursor.fetchone()[0]
                imported_count += 1

                # 插入小题
                subquestion_ids = self.import_problem_subquestions(problem_id, content.get('问题', []))

                # 插入知识点标注
                self.import_problem_knowledge_tags(problem_id, subquestion_ids,
                                                problem.get('知识点标注', {}))

                # 插入方法标注
                self.import_problem_method_tags(problem_id, subquestion_ids,
                                             problem.get('方法标注', {}))

            except Exception as e:
                logger.error(f"导入题目失败 {problem.get('题目基本信息', {}).get('编号', 'unknown')}: {e}")
                continue

        logger.info(f"从 {file_path} 导入了 {imported_count} 个题目")
        return imported_count

    def import_knowledge_relations(self, source_knowledge_id: int, target_codes: List[str], relation_type: str):
        """导入知识点关系"""
        for code in target_codes:
            try:
                # 查找目标知识点ID
                self.cursor.execute("SELECT id FROM knowledge_points WHERE code = %s", (code,))
                result = self.cursor.fetchone()

                if result:
                    target_knowledge_id = result[0]
                    insert_query = """
                    INSERT INTO knowledge_relations (source_knowledge_id, target_knowledge_id, relation_type)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (source_knowledge_id, target_knowledge_id, relation_type)
                    DO NOTHING
                    """
                    self.cursor.execute(insert_query, (source_knowledge_id, target_knowledge_id, relation_type))
            except Exception as e:
                logger.error(f"导入知识点关系失败: {e}")

    def import_method_steps(self, method_id: int, steps: List[Dict]):
        """导入方法步骤"""
        for i, step in enumerate(steps, 1):
            try:
                insert_query = """
                INSERT INTO method_steps (method_id, step_number, step_description, key_points, common_mistakes)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (method_id, step_number)
                DO UPDATE SET
                    step_description = EXCLUDED.step_description,
                    key_points = EXCLUDED.key_points,
                    common_mistakes = EXCLUDED.common_mistakes
                """
                self.cursor.execute(insert_query, (
                    method_id,
                    i,
                    step.get('步骤描述'),
                    step.get('关键要点', []),
                    step.get('常见错误', [])
                ))
            except Exception as e:
                logger.error(f"导入方法步骤失败: {e}")

    def import_method_errors(self, method_id: int, errors: List[Dict]):
        """导入方法常见错误"""
        for error in errors:
            try:
                insert_query = """
                INSERT INTO method_common_errors (method_id, error_description, cause_analysis, correct_approach)
                VALUES (%s, %s, %s, %s)
                """
                self.cursor.execute(insert_query, (
                    method_id,
                    error.get('错误描述'),
                    error.get('原因'),
                    error.get('正确做法')
                ))
            except Exception as e:
                logger.error(f"导入方法错误失败: {e}")

    def import_method_support_knowledge(self, method_id: int, knowledge_codes: List[str]):
        """导入方法支撑知识点"""
        for code in knowledge_codes:
            try:
                # 查找知识点ID
                self.cursor.execute("SELECT id FROM knowledge_points WHERE code = %s", (code,))
                result = self.cursor.fetchone()

                if result:
                    knowledge_id = result[0]
                    insert_query = """
                    INSERT INTO method_support_knowledge (method_id, knowledge_id, support_type)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (method_id, knowledge_id) DO NOTHING
                    """
                    self.cursor.execute(insert_query, (method_id, knowledge_id, '直接支撑'))
            except Exception as e:
                logger.error(f"导入方法支撑知识点失败: {e}")

    def import_problem_subquestions(self, problem_id: int, questions: List[str]) -> List[int]:
        """导入小题"""
        subquestion_ids = []
        for i, question in enumerate(questions, 1):
            try:
                insert_query = """
                INSERT INTO problem_subquestions (problem_id, subquestion_number, content)
                VALUES (%s, %s, %s)
                RETURNING id
                """
                self.cursor.execute(insert_query, (problem_id, f"({i})", question))
                subquestion_ids.append(self.cursor.fetchone()[0])
            except Exception as e:
                logger.error(f"导入小题失败: {e}")
        return subquestion_ids

    def import_problem_knowledge_tags(self, problem_id: int, subquestion_ids: List[int], knowledge_tags: Dict):
        """导入题目知识点标注"""
        for subquestion_key, tag_info in knowledge_tags.items():
            try:
                # 解析小题编号
                subquestion_idx = self.parse_subquestion_key(subquestion_key)
                subquestion_id = subquestion_ids[subquestion_idx - 1] if subquestion_idx <= len(subquestion_ids) else None

                # 处理主要考查的知识点
                for knowledge in tag_info.get('主要考查', []):
                    self.insert_problem_knowledge_tag(problem_id, subquestion_id,
                                                    knowledge.get('知识点编号'), '主要考查')

                # 处理辅助涉及的知识点
                for knowledge in tag_info.get('辅助涉及', []):
                    self.insert_problem_knowledge_tag(problem_id, subquestion_id,
                                                    knowledge.get('知识点编号'), '辅助涉及')
            except Exception as e:
                logger.error(f"导入题目知识点标注失败: {e}")

    def import_problem_method_tags(self, problem_id: int, subquestion_ids: List[int], method_tags: Dict):
        """导入题目方法标注"""
        for subquestion_key, methods in method_tags.items():
            try:
                # 解析小题编号
                subquestion_idx = self.parse_subquestion_key(subquestion_key)
                subquestion_id = subquestion_ids[subquestion_idx - 1] if subquestion_idx <= len(subquestion_ids) else None

                for method in methods:
                    self.insert_problem_method_tag(problem_id, subquestion_id,
                                                 method.get('方法编号'), '主要方法')
            except Exception as e:
                logger.error(f"导入题目方法标注失败: {e}")

    def insert_problem_knowledge_tag(self, problem_id: int, subquestion_id: int, knowledge_code: str, tag_type: str):
        """插入题目知识点标注"""
        try:
            # 查找知识点ID
            self.cursor.execute("SELECT id FROM knowledge_points WHERE code = %s", (knowledge_code,))
            result = self.cursor.fetchone()

            if result:
                knowledge_id = result[0]
                insert_query = """
                INSERT INTO problem_knowledge_tags (problem_id, subquestion_id, knowledge_id, tag_type)
                VALUES (%s, %s, %s, %s)
                ON CONFLICT (problem_id, subquestion_id, knowledge_id, tag_type) DO NOTHING
                """
                self.cursor.execute(insert_query, (problem_id, subquestion_id, knowledge_id, tag_type))
        except Exception as e:
            logger.error(f"插入题目知识点标注失败: {e}")

    def insert_problem_method_tag(self, problem_id: int, subquestion_id: int, method_code: str, usage_type: str):
        """插入题目方法标注"""
        try:
            # 查找方法ID
            self.cursor.execute("SELECT id FROM methods WHERE code = %s", (method_code,))
            result = self.cursor.fetchone()

            if result:
                method_id = result[0]
                insert_query = """
                INSERT INTO problem_method_tags (problem_id, subquestion_id, method_id, usage_frequency)
                VALUES (%s, %s, %s, %s)
                ON CONFLICT (problem_id, subquestion_id, method_id) DO NOTHING
                """
                self.cursor.execute(insert_query, (problem_id, subquestion_id, method_id, usage_type))
        except Exception as e:
            logger.error(f"插入题目方法标注失败: {e}")

    def parse_subquestion_key(self, key: str) -> int:
        """解析小题键名获取编号"""
        if key == '小题1':
            return 1
        elif key == '小题2':
            return 2
        elif key == '小题3':
            return 3
        elif key == '小题1':
            return 1
        else:
            # 尝试从键名中提取数字
            import re
            match = re.search(r'\d+', key)
            return int(match.group()) if match else 1

    def import_all_files(self, directory: str):
        """
        导入目录下的所有JSON文件

        Args:
            directory: JSON文件目录路径
        """
        if not os.path.exists(directory):
            logger.error(f"目录不存在: {directory}")
            return

        files = [f for f in os.listdir(directory) if f.endswith('.json')]
        total_imported = {
            'knowledge': 0,
            'method': 0,
            'problem': 0
        }

        for filename in files:
            file_path = os.path.join(directory, filename)

            try:
                if filename.startswith('knowledge-'):
                    count = self.import_knowledge_data(file_path)
                    total_imported['knowledge'] += count
                elif filename.startswith('method-'):
                    count = self.import_method_data(file_path)
                    total_imported['method'] += count
                elif filename.startswith('problems-'):
                    count = self.import_problem_data(file_path)
                    total_imported['problem'] += count
                else:
                    logger.warning(f"未知文件类型，跳过: {filename}")
                    continue

                self.conn.commit()

            except Exception as e:
                logger.error(f"导入文件失败 {filename}: {e}")
                self.conn.rollback()
                continue

        logger.info("导入完成统计:")
        logger.info(f"知识点: {total_imported['knowledge']} 个")
        logger.info(f"方法: {total_imported['method']} 个")
        logger.info(f"题目: {total_imported['problem']} 个")

def main():
    """主函数"""
    # 数据库配置
    db_config = {
        'host': 'localhost',
        'port': 5432,
        'database': 'math_knowledge_graph',
        'user': 'postgres',
        'password': 'your_password'  # 请修改为实际密码
    }

    # JSON文件目录
    json_directory = '/Users/robertmaxwell/note/知识图谱/教科书-数学/all'

    # 创建导入器并执行导入
    importer = KnowledgeGraphImporter(db_config)

    try:
        importer.connect()
        importer.import_all_files(json_directory)
    except Exception as e:
        logger.error(f"导入过程中发生错误: {e}")
    finally:
        importer.disconnect()

if __name__ == "__main__":
    main()