# config.py 5.6 KB
  1. import os
  2. import sys
  3. # 导入app_config获取数据库等配置
  4. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  5. try:
  6. import app_config
  7. except ImportError:
  8. app_config = None
  9. # Schema Tools专用配置
  10. SCHEMA_TOOLS_CONFIG = {
  11. # 核心配置
  12. "default_db_connection": None, # 从命令行指定
  13. "default_business_context": "数据库管理系统",
  14. "output_directory": "training/generated_data",
  15. # 处理链配置
  16. "default_pipeline": "full",
  17. "available_pipelines": {
  18. "full": [
  19. "database_inspector",
  20. "data_sampler",
  21. "comment_generator",
  22. "ddl_generator",
  23. "doc_generator"
  24. ],
  25. "ddl_only": [
  26. "database_inspector",
  27. "data_sampler",
  28. "comment_generator",
  29. "ddl_generator"
  30. ],
  31. "analysis_only": [
  32. "database_inspector",
  33. "data_sampler",
  34. "comment_generator"
  35. ]
  36. },
  37. # 数据处理配置
  38. "sample_data_limit": 20, # 用于LLM分析的采样数据量
  39. "enum_detection_sample_limit": 5000, # 枚举检测时的采样限制
  40. "enum_max_distinct_values": 20, # 枚举字段最大不同值数量
  41. "enum_varchar_keywords": [ # VARCHAR枚举关键词
  42. "性别", "gender", "状态", "status", "类型", "type",
  43. "级别", "level", "方向", "direction", "品类", "classify",
  44. "模式", "mode", "格式", "format"
  45. ],
  46. "large_table_threshold": 1000000, # 大表阈值(行数)
  47. # 并发配置
  48. "max_concurrent_tables": 1, # 最大并发处理表数(建议保持1,避免LLM并发调用问题)
  49. # LLM配置
  50. "use_app_config_llm": True, # 是否使用app_config中的LLM配置
  51. "comment_generation_timeout": 30, # LLM调用超时时间(秒)
  52. "max_llm_retries": 3, # LLM调用最大重试次数
  53. # 系统表过滤配置
  54. "filter_system_tables": True, # 是否过滤系统表
  55. "custom_system_prefixes": [], # 用户自定义系统表前缀
  56. "custom_system_schemas": [], # 用户自定义系统schema
  57. # 权限与安全配置
  58. "check_permissions": True, # 是否检查数据库权限
  59. "require_select_permission": True, # 是否要求SELECT权限
  60. "allow_readonly_database": True, # 是否允许只读数据库
  61. # 错误处理配置
  62. "continue_on_error": True, # 遇到错误是否继续
  63. "max_table_failures": 5, # 最大允许失败表数
  64. "skip_large_tables": False, # 是否跳过超大表
  65. "max_table_size": 10000000, # 最大表行数限制
  66. # 文件配置
  67. "ddl_file_suffix": ".ddl",
  68. "doc_file_suffix": "_detail.md",
  69. "log_file": "schema_tools.log",
  70. "create_subdirectories": False, # 是否创建ddl/docs子目录
  71. # 输出格式配置
  72. "include_sample_data_in_comments": True, # 注释中是否包含示例数据
  73. "max_comment_length": 500, # 最大注释长度
  74. "include_field_statistics": True, # 是否包含字段统计信息
  75. # 调试配置
  76. "debug_mode": False, # 调试模式
  77. "save_llm_prompts": False, # 是否保存LLM提示词
  78. "save_llm_responses": False, # 是否保存LLM响应
  79. # Question-SQL生成配置
  80. "qs_generation": {
  81. "max_tables": 20, # 最大表数量限制
  82. "theme_count": 5, # LLM生成的主题数量
  83. "questions_per_theme": 10, # 每个主题生成的问题数
  84. "max_concurrent_themes": 1, # 并行处理的主题数量
  85. "continue_on_theme_error": True, # 主题生成失败是否继续
  86. "save_intermediate": True, # 是否保存中间结果
  87. "output_file_prefix": "qs", # 输出文件前缀
  88. }
  89. }
  90. # 从app_config获取相关配置(如果可用)
  91. if app_config:
  92. # 继承数据库配置
  93. if hasattr(app_config, 'PGVECTOR_CONFIG'):
  94. pgvector_config = app_config.PGVECTOR_CONFIG
  95. if not SCHEMA_TOOLS_CONFIG["default_db_connection"]:
  96. SCHEMA_TOOLS_CONFIG["default_db_connection"] = (
  97. f"postgresql://{pgvector_config['user']}:{pgvector_config['password']}"
  98. f"@{pgvector_config['host']}:{pgvector_config['port']}/{pgvector_config['dbname']}"
  99. )
  100. def get_config():
  101. """获取当前配置"""
  102. return SCHEMA_TOOLS_CONFIG
  103. def update_config(**kwargs):
  104. """更新配置"""
  105. SCHEMA_TOOLS_CONFIG.update(kwargs)
  106. def validate_config():
  107. """验证配置有效性"""
  108. errors = []
  109. # 检查必要配置
  110. if SCHEMA_TOOLS_CONFIG["max_concurrent_tables"] <= 0:
  111. errors.append("max_concurrent_tables 必须大于0")
  112. if SCHEMA_TOOLS_CONFIG["sample_data_limit"] <= 0:
  113. errors.append("sample_data_limit 必须大于0")
  114. # 检查处理链配置
  115. default_pipeline = SCHEMA_TOOLS_CONFIG["default_pipeline"]
  116. available_pipelines = SCHEMA_TOOLS_CONFIG["available_pipelines"]
  117. if default_pipeline not in available_pipelines:
  118. errors.append(f"default_pipeline '{default_pipeline}' 不在 available_pipelines 中")
  119. if errors:
  120. raise ValueError("配置验证失败:\n" + "\n".join(f" - {error}" for error in errors))
  121. return True
  122. # 启动时验证配置
  123. try:
  124. validate_config()
  125. except ValueError as e:
  126. print(f"警告: {e}")