# config.py
# Configuration for Schema Tools.

  1. import os
  2. import sys
  3. # 导入app_config获取数据库等配置
  4. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  5. try:
  6. import app_config
  7. except ImportError:
  8. app_config = None
  9. # Schema Tools专用配置
  10. SCHEMA_TOOLS_CONFIG = {
  11. # 核心配置
  12. "default_db_connection": None, # 从命令行指定
  13. "default_business_context": "数据库管理系统",
  14. "output_directory": "training/generated_data",
  15. # 处理链配置
  16. "default_pipeline": "full",
  17. "available_pipelines": {
  18. "full": [
  19. "database_inspector",
  20. "data_sampler",
  21. "comment_generator",
  22. "ddl_generator",
  23. "doc_generator"
  24. ],
  25. "ddl_only": [
  26. "database_inspector",
  27. "data_sampler",
  28. "comment_generator",
  29. "ddl_generator"
  30. ],
  31. "analysis_only": [
  32. "database_inspector",
  33. "data_sampler",
  34. "comment_generator"
  35. ]
  36. },
  37. # 数据处理配置
  38. "sample_data_limit": 20, # 用于LLM分析的采样数据量
  39. "enum_detection_sample_limit": 5000, # 枚举检测时的采样限制
  40. "enum_max_distinct_values": 20, # 枚举字段最大不同值数量
  41. "enum_varchar_keywords": [ # VARCHAR枚举关键词
  42. "性别", "gender", "状态", "status", "类型", "type",
  43. "级别", "level", "方向", "direction", "品类", "classify",
  44. "模式", "mode", "格式", "format"
  45. ],
  46. "large_table_threshold": 1000000, # 大表阈值(行数)
  47. # 并发配置
  48. "max_concurrent_tables": 1, # 最大并发处理表数(建议保持1,避免LLM并发调用问题)
  49. # LLM配置
  50. "use_app_config_llm": True, # 是否使用app_config中的LLM配置
  51. "comment_generation_timeout": 30, # LLM调用超时时间(秒)
  52. "max_llm_retries": 3, # LLM调用最大重试次数
  53. # 系统表过滤配置
  54. "filter_system_tables": True, # 是否过滤系统表
  55. "custom_system_prefixes": [], # 用户自定义系统表前缀
  56. "custom_system_schemas": [], # 用户自定义系统schema
  57. # 权限与安全配置
  58. "check_permissions": True, # 是否检查数据库权限
  59. "require_select_permission": True, # 是否要求SELECT权限
  60. "allow_readonly_database": True, # 是否允许只读数据库
  61. # 错误处理配置
  62. "continue_on_error": True, # 遇到错误是否继续
  63. "max_table_failures": 5, # 最大允许失败表数
  64. "skip_large_tables": False, # 是否跳过超大表
  65. "max_table_size": 10000000, # 最大表行数限制
  66. # 文件配置
  67. "ddl_file_suffix": ".ddl",
  68. "doc_file_suffix": "_detail.md",
  69. "log_file": "schema_tools.log",
  70. "create_subdirectories": True, # 是否创建ddl/docs子目录
  71. # 输出格式配置
  72. "include_sample_data_in_comments": True, # 注释中是否包含示例数据
  73. "max_comment_length": 500, # 最大注释长度
  74. "include_field_statistics": True, # 是否包含字段统计信息
  75. # 调试配置
  76. "debug_mode": False, # 调试模式
  77. "save_llm_prompts": False, # 是否保存LLM提示词
  78. "save_llm_responses": False, # 是否保存LLM响应
  79. }
  80. # 从app_config获取相关配置(如果可用)
  81. if app_config:
  82. # 继承数据库配置
  83. if hasattr(app_config, 'PGVECTOR_CONFIG'):
  84. pgvector_config = app_config.PGVECTOR_CONFIG
  85. if not SCHEMA_TOOLS_CONFIG["default_db_connection"]:
  86. SCHEMA_TOOLS_CONFIG["default_db_connection"] = (
  87. f"postgresql://{pgvector_config['user']}:{pgvector_config['password']}"
  88. f"@{pgvector_config['host']}:{pgvector_config['port']}/{pgvector_config['dbname']}"
  89. )
  90. def get_config():
  91. """获取当前配置"""
  92. return SCHEMA_TOOLS_CONFIG
  93. def update_config(**kwargs):
  94. """更新配置"""
  95. SCHEMA_TOOLS_CONFIG.update(kwargs)
  96. def validate_config():
  97. """验证配置有效性"""
  98. errors = []
  99. # 检查必要配置
  100. if SCHEMA_TOOLS_CONFIG["max_concurrent_tables"] <= 0:
  101. errors.append("max_concurrent_tables 必须大于0")
  102. if SCHEMA_TOOLS_CONFIG["sample_data_limit"] <= 0:
  103. errors.append("sample_data_limit 必须大于0")
  104. # 检查处理链配置
  105. default_pipeline = SCHEMA_TOOLS_CONFIG["default_pipeline"]
  106. available_pipelines = SCHEMA_TOOLS_CONFIG["available_pipelines"]
  107. if default_pipeline not in available_pipelines:
  108. errors.append(f"default_pipeline '{default_pipeline}' 不在 available_pipelines 中")
  109. if errors:
  110. raise ValueError("配置验证失败:\n" + "\n".join(f" - {error}" for error in errors))
  111. return True
  112. # 启动时验证配置
  113. try:
  114. validate_config()
  115. except ValueError as e:
  116. print(f"警告: {e}")