vanna_trainer.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. # vanna_trainer.py
  2. import os
  3. import time
  4. import threading
  5. import queue
  6. import concurrent.futures
  7. from functools import lru_cache
  8. from collections import defaultdict
  9. from typing import List, Dict, Any, Tuple, Optional, Union, Callable
  10. import sys
  11. import os
  12. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  13. import app_config
  14. # 设置正确的项目根目录路径
  15. project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  16. # 创建vanna实例
  17. from core.vanna_llm_factory import create_vanna_instance
  18. vn = create_vanna_instance()
  19. # 使用新的配置工具函数获取embedding配置
  20. try:
  21. from common.utils import get_current_embedding_config, get_current_model_info
  22. embedding_config = get_current_embedding_config()
  23. model_info = get_current_model_info()
  24. print(f"\n===== Embedding模型信息 =====")
  25. print(f"模型类型: {model_info['embedding_type']}")
  26. print(f"模型名称: {model_info['embedding_model']}")
  27. print(f"向量维度: {embedding_config.get('embedding_dimension', '未知')}")
  28. if 'base_url' in embedding_config:
  29. print(f"API服务: {embedding_config['base_url']}")
  30. print("==============================")
  31. except ImportError as e:
  32. print(f"警告: 无法导入配置工具函数: {e}")
  33. print("使用默认配置...")
  34. embedding_config = getattr(app_config, 'API_EMBEDDING_CONFIG', {})
  35. print(f"\n===== Embedding模型信息 (默认) =====")
  36. print(f"模型名称: {embedding_config.get('model_name', '未知')}")
  37. print("==============================")
# Training batch-processing settings, read from app_config.
# They are used below as the defaults for BatchProcessor.
BATCH_PROCESSING_ENABLED = app_config.TRAINING_BATCH_PROCESSING_ENABLED
BATCH_SIZE = app_config.TRAINING_BATCH_SIZE
MAX_WORKERS = app_config.TRAINING_MAX_WORKERS
  42. # 训练数据批处理器
  43. # 专门用于优化训练过程的批处理器,将多个训练项目打包处理以提高效率
  44. class BatchProcessor:
  45. def __init__(self, batch_size=BATCH_SIZE, max_workers=MAX_WORKERS):
  46. self.batch_size = batch_size
  47. self.max_workers = max_workers
  48. self.batches = defaultdict(list)
  49. self.lock = threading.Lock() # 线程安全锁
  50. # 初始化工作线程池
  51. self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
  52. # 是否启用批处理
  53. self.batch_enabled = BATCH_PROCESSING_ENABLED
  54. print(f"[DEBUG] 训练批处理器初始化: 启用={self.batch_enabled}, 批大小={self.batch_size}, 最大工作线程={self.max_workers}")
  55. def add_item(self, batch_type: str, item: Dict[str, Any]):
  56. """添加一个项目到批处理队列"""
  57. if not self.batch_enabled:
  58. # 如果未启用批处理,直接处理
  59. self._process_single_item(batch_type, item)
  60. return
  61. with self.lock:
  62. self.batches[batch_type].append(item)
  63. if len(self.batches[batch_type]) >= self.batch_size:
  64. batch_items = self.batches[batch_type]
  65. self.batches[batch_type] = []
  66. # 提交批处理任务到线程池
  67. self.executor.submit(self._process_batch, batch_type, batch_items)
  68. def _process_single_item(self, batch_type: str, item: Dict[str, Any]):
  69. """处理单个项目"""
  70. try:
  71. if batch_type == 'ddl':
  72. vn.train(ddl=item['ddl'])
  73. elif batch_type == 'documentation':
  74. vn.train(documentation=item['documentation'])
  75. elif batch_type == 'question_sql':
  76. vn.train(question=item['question'], sql=item['sql'])
  77. print(f"[DEBUG] 单项处理成功: {batch_type}")
  78. except Exception as e:
  79. print(f"[ERROR] 处理 {batch_type} 项目失败: {e}")
  80. def _process_batch(self, batch_type: str, items: List[Dict[str, Any]]):
  81. """处理一批项目"""
  82. print(f"[INFO] 开始批量处理 {len(items)} 个 {batch_type} 项")
  83. start_time = time.time()
  84. try:
  85. # 准备批处理数据
  86. batch_data = []
  87. if batch_type == 'ddl':
  88. for item in items:
  89. batch_data.append({
  90. 'type': 'ddl',
  91. 'content': item['ddl']
  92. })
  93. elif batch_type == 'documentation':
  94. for item in items:
  95. batch_data.append({
  96. 'type': 'documentation',
  97. 'content': item['documentation']
  98. })
  99. elif batch_type == 'question_sql':
  100. for item in items:
  101. batch_data.append({
  102. 'type': 'question_sql',
  103. 'question': item['question'],
  104. 'sql': item['sql']
  105. })
  106. # 使用批量添加方法
  107. if hasattr(vn, 'add_batch') and callable(getattr(vn, 'add_batch')):
  108. success = vn.add_batch(batch_data)
  109. if success:
  110. print(f"[INFO] 批量处理成功: {len(items)} 个 {batch_type} 项")
  111. else:
  112. print(f"[WARNING] 批量处理部分失败: {batch_type}")
  113. else:
  114. # 如果没有批处理方法,退回到逐条处理
  115. print(f"[WARNING] 批处理不可用,使用逐条处理: {batch_type}")
  116. for item in items:
  117. self._process_single_item(batch_type, item)
  118. except Exception as e:
  119. print(f"[ERROR] 批处理 {batch_type} 失败: {e}")
  120. # 如果批处理失败,尝试逐条处理
  121. print(f"[INFO] 尝试逐条处理...")
  122. for item in items:
  123. try:
  124. self._process_single_item(batch_type, item)
  125. except Exception as item_e:
  126. print(f"[ERROR] 处理项目失败: {item_e}")
  127. elapsed = time.time() - start_time
  128. print(f"[INFO] 批处理完成 {len(items)} 个 {batch_type} 项,耗时 {elapsed:.2f} 秒")
  129. def flush_all(self):
  130. """强制处理所有剩余项目"""
  131. with self.lock:
  132. for batch_type, items in self.batches.items():
  133. if items:
  134. print(f"[INFO] 正在处理剩余的 {len(items)} 个 {batch_type} 项")
  135. self._process_batch(batch_type, items)
  136. # 清空队列
  137. self.batches = defaultdict(list)
  138. print("[INFO] 所有训练批处理项目已完成")
  139. def shutdown(self):
  140. """关闭处理器和线程池"""
  141. self.flush_all()
  142. self.executor.shutdown(wait=True)
  143. print("[INFO] 训练批处理器已关闭")
# Module-wide batch processor instance.
# All training helper functions below route their items through it.
batch_processor = BatchProcessor()
  147. # 原始训练函数的批处理增强版本
  148. def train_ddl(ddl_sql: str):
  149. print(f"[DDL] Training on DDL:\n{ddl_sql}")
  150. batch_processor.add_item('ddl', {'ddl': ddl_sql})
  151. def train_documentation(doc: str):
  152. print(f"[DOC] Training on documentation:\n{doc}")
  153. batch_processor.add_item('documentation', {'documentation': doc})
  154. def train_sql_example(sql: str):
  155. """训练单个SQL示例,通过SQL生成相应的问题"""
  156. print(f"[SQL] Training on SQL:\n{sql}")
  157. try:
  158. # 直接调用generate_question方法
  159. question = vn.generate_question(sql=sql)
  160. question = question.strip()
  161. if not question.endswith("?") and not question.endswith("?"):
  162. question += "?"
  163. except Exception as e:
  164. print(f"[ERROR] 生成问题时出错: {e}")
  165. raise Exception(f"无法为SQL生成问题: {e}")
  166. print(f"[SQL] 生成问题: {question}")
  167. # 使用标准方式存储问题-SQL对
  168. batch_processor.add_item('question_sql', {'question': question, 'sql': sql})
  169. def train_question_sql_pair(question: str, sql: str):
  170. print(f"[Q-S] Training on:\nquestion: {question}\nsql: {sql}")
  171. batch_processor.add_item('question_sql', {'question': question, 'sql': sql})
# Flush all pending items once training input is complete.
def flush_training():
    """Force processing of all pending training items.

    Delegates to ``batch_processor.flush_all()``.
    """
    batch_processor.flush_all()
# Shut down the trainer.
def shutdown_trainer():
    """Shut down the trainer and its resources.

    Delegates to ``batch_processor.shutdown()``.
    """
    batch_processor.shutdown()